author     Jukka Ojanen <jukka.ojanen@linkotec.net>    2014-10-31 17:55:21 +0200
committer  Jukka Ojanen <jukka.ojanen@linkotec.net>    2014-10-31 17:55:21 +0200
commit     196fb0c0c1541cf1ec1b5e9ff8ac0e8109fde29c (patch)
tree       a35307258c8cd76cf4c41af630938943c2c47e09
parent     7b999686ec4c732d28efd344065606fccba84ae4 (diff)
Add CMake as an alternative build system
Add support for Windows x64 (requires YASM)
-rw-r--r--  CMakeLists.txt      158
-rw-r--r--  src/codegen.c      1501
-rw-r--r--  src/codegen.h        27
-rw-r--r--  src/codegen_sse.h   240
-rw-r--r--  src/ffts.c          940
-rw-r--r--  src/ffts.h          273
-rw-r--r--  src/macros-sse.h     44
-rw-r--r--  src/macros.h        167
-rw-r--r--  src/sse.s            13
-rw-r--r--  src/sse_win64.s     840
10 files changed, 2878 insertions(+), 1325 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..365ec32
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,158 @@
+cmake_minimum_required(VERSION 2.8)
+
+project(ffts C ASM)
+
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+# the default build type is empty, which means no optimization
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE "Release")
+endif(NOT CMAKE_BUILD_TYPE)
+
+# common options
+option(ENABLE_SSE
+ "Enables the use of SSE instructions." ON
+)
+
+option(ENABLE_NEON
+ "Enables the use of NEON instructions." OFF
+)
+
+option(ENABLE_VFP
+ "Enables the use of VFP instructions." OFF
+)
+
+option(DISABLE_DYNAMIC_CODE
+ "Disables the use of dynamic machine code generation." OFF
+)
+
+option(ENABLE_SHARED
+ "Enable building a shared library." OFF
+)
+
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pedantic -pipe -Wall")
+add_definitions(-DFFTS_CMAKE_GENERATED)
+
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+
+if(MSVC)
+ add_definitions(-D_USE_MATH_DEFINES)
+else()
+ # some systems need libm for some of the math functions to work
+ check_library_exists(m pow "" HAVE_LIBM)
+ if(HAVE_LIBM)
+ list(APPEND CMAKE_REQUIRED_LIBRARIES m)
+ list(APPEND FFTS_EXTRA_LIBRARIES m)
+ endif(HAVE_LIBM)
+endif(MSVC)
+
+include_directories(src)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+set(FFTS_SOURCES
+ src/ffts_attributes.h
+ src/ffts.c
+ src/ffts.h
+ src/ffts_nd.c
+ src/ffts_nd.h
+ src/ffts_real.h
+ src/ffts_real.c
+ src/ffts_real_nd.c
+ src/ffts_real_nd.h
+ src/ffts_small.c
+ src/macros.h
+ src/patterns.c
+ src/patterns.h
+ src/types.h
+)
+
+if(ENABLE_SSE)
+ list(APPEND FFTS_SOURCES
+ src/macros-sse.h
+ )
+
+ if(MSVC)
+ set(CMAKE_ASM-ATT_COMPILER yasm)
+ enable_language(ASM-ATT)
+
+ add_custom_command(
+ OUTPUT sse_win64.obj
+ COMMAND ${CMAKE_ASM-ATT_COMPILER} -f win64 -m amd64
+ -o ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj -p gas
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/sse_win64.s
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMENT "Generating sse_win64.obj"
+ )
+
+ list(APPEND FFTS_SOURCES
+ ${CMAKE_CURRENT_BINARY_DIR}/sse_win64.obj
+ src/sse_win64.s
+ )
+ else()
+ list(APPEND FFTS_SOURCES
+ src/sse.s
+ )
+ endif(MSVC)
+
+ add_definitions(-D_USE_MATH_DEFINES)
+ add_definitions(-D__x86_64__)
+ add_definitions(-DHAVE_SSE -msse2)
+endif()
+
+if(ENABLE_NEON)
+ if(DISABLE_DYNAMIC_CODE)
+ list(APPEND FFTS_SOURCES
+ source/neon_static_f.s
+ source/neon_static_i.s
+ )
+ else()
+ list(APPEND FFTS_SOURCES
+ source/neon.s
+ source/arch/neon.c
+ )
+ endif()
+
+ add_definitions(-DHAVE_NEON)
+endif()
+
+if(ENABLE_VFP)
+ list(APPEND FFTS_SOURCES
+ source/vfp.s
+ source/arch/vfp.c
+ )
+
+ add_definitions(-DHAVE_VFP)
+endif()
+
+if(ENABLE_SINGLE)
+ add_definitions(-DHAVE_SINGLE)
+endif()
+
+if(DISABLE_DYNAMIC_CODE)
+ list(APPEND FFTS_SOURCES
+ src/ffts_static.c
+ )
+
+ add_definitions(-DDYNAMIC_DISABLED)
+else()
+ list(APPEND FFTS_SOURCES
+ src/codegen.c
+ src/codegen.h
+ )
+endif()
+
+add_library(ffts_static
+ ${FFTS_SOURCES}
+)
+
+add_executable(ffts_test
+ tests/test.c
+)
+
+target_link_libraries(ffts_test
+ ffts_static
+ ${FFTS_EXTRA_LIBRARIES}
+)
\ No newline at end of file
diff --git a/src/codegen.c b/src/codegen.c
index 79aaca6..0cc3d24 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -1,10 +1,10 @@
/*
-
+
This file is part of FFTS -- The Fastest Fourier Transform in the South
-
+
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
+ Copyright (c) 2012, The University of Waikato
+
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -35,698 +35,951 @@
#include "macros.h"
#include "ffts.h"
-#ifdef __APPLE__
- #include <libkern/OSCacheControl.h>
-#endif
-
-#include <sys/types.h>
-#include <sys/mman.h>
-
#ifdef HAVE_NEON
- #include "codegen_arm.h"
- #include "neon.h"
+#include "codegen_arm.h"
+#include "neon.h"
#elif HAVE_VFP
- #include "codegen_arm.h"
- #include "vfp.h"
+#include "codegen_arm.h"
+#include "vfp.h"
#else
- #include "codegen_sse.h"
- #include "macros-sse.h"
+#include "codegen_sse.h"
+#include "macros-sse.h"
#endif
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+/* #include <stdio.h> */
+#include <stdlib.h>
+#include <string.h>
+
#ifdef __ANDROID__
- #include <unistd.h>
+#include <unistd.h>
#endif
-int tree_count(int N, int leafN, int offset) {
-
- if(N <= leafN) return 0;
- int count = 0;
- count += tree_count(N/4, leafN, offset);
- count += tree_count(N/8, leafN, offset + N/4);
- count += tree_count(N/8, leafN, offset + N/4 + N/8);
- count += tree_count(N/4, leafN, offset + N/2);
- count += tree_count(N/4, leafN, offset + 3*N/4);
-
- return 1 + count;
-}
-
-void elaborate_tree(size_t **p, int N, int leafN, int offset) {
-
- if(N <= leafN) return;
- elaborate_tree(p, N/4, leafN, offset);
- elaborate_tree(p, N/8, leafN, offset + N/4);
- elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
- elaborate_tree(p, N/4, leafN, offset + N/2);
- elaborate_tree(p, N/4, leafN, offset + 3*N/4);
-
- (*p)[0] = N;
- (*p)[1] = offset*2;
+#ifdef __arm__
+typedef uint32_t insns_t;
+#else
+typedef uint8_t insns_t;
+#endif
- (*p)+=2;
-}
+#define P(x) (*(*p)++ = x)
+static int ffts_tree_count(int N, int leaf_N, int offset)
+{
+ int count;
+ if (N <= leaf_N) {
+ return 0;
+ }
+ count = ffts_tree_count(N/4, leaf_N, offset);
+ count += ffts_tree_count(N/8, leaf_N, offset + N/4);
+ count += ffts_tree_count(N/8, leaf_N, offset + N/4 + N/8);
+ count += ffts_tree_count(N/4, leaf_N, offset + N/2);
+ count += ffts_tree_count(N/4, leaf_N, offset + 3*N/4);
-uint32_t LUT_offset(size_t N, size_t leafN) {
- int i;
- size_t p_lut_size = 0;
- size_t lut_size = 0;
- int hardcoded = 0;
- size_t n_luts = __builtin_ctzl(N/leafN);
- int n = leafN*2;
- //if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
-
- for(i=0;i<n_luts-1;i++) {
- p_lut_size = lut_size;
- if(!i || hardcoded) {
- #ifdef __arm__
- if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
- else lut_size += n/4 * sizeof(cdata_t);
- #else
- lut_size += n/4 * 2 * sizeof(cdata_t);
- #endif
- // n *= 2;
- } else {
- #ifdef __arm__
- lut_size += n/8 * 3 * sizeof(cdata_t);
- #else
- lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
- #endif
- }
- n *= 2;
- }
- return lut_size;
+ return 1 + count;
}
-#ifdef __arm__
- typedef uint32_t insns_t;
-#else
- typedef uint8_t insns_t;
-#endif
+static void ffts_elaborate_tree(size_t **p, int N, int leaf_N, int offset)
+{
+ if (N <= leaf_N) {
+ return;
+ }
-#define P(x) (*(*p)++ = x)
+ ffts_elaborate_tree(p, N/4, leaf_N, offset);
+ ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4);
+ ffts_elaborate_tree(p, N/8, leaf_N, offset + N/4 + N/8);
+ ffts_elaborate_tree(p, N/4, leaf_N, offset + N/2);
+ ffts_elaborate_tree(p, N/4, leaf_N, offset + 3*N/4);
+
+ (*p)[0] = N;
+ (*p)[1] = 2 * offset;
-void insert_nops(uint8_t **p, uint32_t count) {
- switch(count) {
- case 0: break;
- case 2: P(0x66);
- case 1: P(0x90); break;
- case 3: P(0x0F); P(0x1F); P(0x00); break;
- case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break;
- case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
- case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
- case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break;
- case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
- case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
- default:
- P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00);
- insert_nops(p, count-9);
- break;
- }
+ (*p) += 2;
}
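
Review note: ffts_tree_count and ffts_elaborate_tree walk the same 4/8/8/4/4 split-radix-style recursion. The first sizes the buffer, the second fills it with (size, 2*offset) pairs in post-order, and ffts_generate_func_code below terminates the list with a {0, 0} sentinel. A minimal standalone sketch that prints the resulting schedule for N = 64, leaf_N = 8:

/* Standalone replica of the plan-tree helpers above; the last pair
 * printed is the {0, 0} sentinel. */
#include <stdio.h>
#include <stdlib.h>

static int tree_count(int N, int leaf_N, int offset)
{
    if (N <= leaf_N)
        return 0;
    return 1 + tree_count(N/4, leaf_N, offset)
             + tree_count(N/8, leaf_N, offset + N/4)
             + tree_count(N/8, leaf_N, offset + N/4 + N/8)
             + tree_count(N/4, leaf_N, offset + N/2)
             + tree_count(N/4, leaf_N, offset + 3*N/4);
}

static void elaborate_tree(size_t **p, int N, int leaf_N, int offset)
{
    if (N <= leaf_N)
        return;
    elaborate_tree(p, N/4, leaf_N, offset);
    elaborate_tree(p, N/8, leaf_N, offset + N/4);
    elaborate_tree(p, N/8, leaf_N, offset + N/4 + N/8);
    elaborate_tree(p, N/4, leaf_N, offset + N/2);
    elaborate_tree(p, N/4, leaf_N, offset + 3*N/4);
    (*p)[0] = N;          /* transform size at this node */
    (*p)[1] = 2 * offset; /* offset into interleaved (re,im) data */
    *p += 2;
}

int main(void)
{
    int i, count = tree_count(64, 8, 0) + 1; /* +1 for the sentinel */
    size_t *ps = malloc(2 * count * sizeof(*ps));
    size_t *pps = ps;

    elaborate_tree(&pps, 64, 8, 0);
    pps[0] = pps[1] = 0; /* sentinel, as in ffts_generate_func_code */

    for (i = 0; i < count; i++)
        printf("size %zu, offset %zu\n", ps[2*i], ps[2*i + 1]);

    free(ps);
    return 0;
}
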
+static void ffts_insert_nops(uint8_t **p, uint32_t count)
+{
+ if (count >= 9) {
+ P(0x66);
+ P(0x0F);
+ P(0x1F);
+ P(0x84);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+
+ if (count > 9) {
+ ffts_insert_nops(p, count - 9);
+ }
+ } else {
+ switch(count) {
+ case 0:
+ break;
+ case 2:
+ P(0x66);
+ /* fall through */
+ case 1:
+ P(0x90);
+ break;
+ case 3:
+ P(0x0F);
+ P(0x1F);
+ P(0x00);
+ break;
+ case 4:
+ P(0x0F);
+ P(0x1F);
+ P(0x40);
+ P(0x00);
+ break;
+ case 5:
+ P(0x0F);
+ P(0x1F);
+ P(0x44);
+ P(0x00);
+ P(0x00);
+ break;
+ case 6:
+ P(0x66);
+ P(0x0F);
+ P(0x1F);
+ P(0x44);
+ P(0x00);
+ P(0x00);
+ break;
+ case 7:
+ P(0x0F);
+ P(0x1F);
+ P(0x80);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ break;
+ case 8:
+ default:
+ P(0x0F);
+ P(0x1F);
+ P(0x84);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ P(0x00);
+ break;
+ }
+ }
+}
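
Review note: the byte strings above are the recommended single-instruction multi-byte NOPs (90; 66 90; 0F 1F 00; ... 66 0F 1F 84 00 00 00 00 00), so a pad of count bytes decodes as at most ceil(count/9) instructions instead of a run of one-byte NOPs. A table-driven sketch of an equivalent emitter:

/* Sketch: same padding as ffts_insert_nops, driven by a table of the
 * canonical 1..9 byte NOP encodings. */
#include <stdint.h>
#include <string.h>

static void emit_nops(uint8_t **p, uint32_t count)
{
    static const uint8_t nops[9][9] = {
        { 0x90 },
        { 0x66, 0x90 },
        { 0x0F, 0x1F, 0x00 },
        { 0x0F, 0x1F, 0x40, 0x00 },
        { 0x0F, 0x1F, 0x44, 0x00, 0x00 },
        { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 },
        { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 },
        { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
        { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
    };

    while (count) {
        uint32_t n = count > 9 ? 9 : count; /* longest NOP first */
        memcpy(*p, nops[n - 1], n);
        *p += n;
        count -= n;
    }
}
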
-void align_mem16(uint8_t **p, uint32_t offset) {
+static void ffts_align_mem16(uint8_t **p, uint32_t offset)
+{
#ifdef __x86_64__
- int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf);
- r = (16 + r) & 0xf;
- insert_nops(p, r);
+ int r = (16 - (offset & 0xf)) - ((uintptr_t)(*p) & 0xf);
+ r = (16 + r) & 0xf;
+ ffts_insert_nops(p, r);
#endif
}
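
Review note: the arithmetic picks a pad r such that (*p + r + offset) is 16-byte aligned, i.e. r = -((uintptr_t)*p + offset) mod 16; offset is the length of the instructions that will sit between the pad and the label being aligned (5 or 6 bytes for the MOVI prologue below). A small sketch that exhaustively checks the identity:

/* Sketch verifying the padding arithmetic in ffts_align_mem16. */
#include <assert.h>
#include <stdint.h>

static int pad_for(uintptr_t addr, uint32_t offset)
{
    int r = (16 - (int)(offset & 0xf)) - (int)(addr & 0xf);
    return (16 + r) & 0xf;
}

int main(void)
{
    uintptr_t addr;
    uint32_t offset;

    for (addr = 0; addr < 16; addr++) {
        for (offset = 0; offset < 16; offset++) {
            int r = pad_for(addr, offset);
            assert((addr + r + offset) % 16 == 0);
        }
    }
    return 0;
}
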
-void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
- int count = tree_count(N, leafN, 0) + 1;
- size_t *ps = malloc(count * 2 * sizeof(size_t));
- size_t *pps = ps;
+transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+ uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
+ uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
+
+ int32_t pAddr = 0;
+ int32_t pN = 0;
+ int32_t pLUT = 0;
+
+ insns_t *fp;
+ insns_t *start;
+ insns_t *x_4_addr;
+ insns_t *x_8_addr;
+ uint32_t loop_count;
+
+ int count;
+ int i;
+ ptrdiff_t len;
+
+ size_t *ps;
+ size_t *pps;
+
+ count = ffts_tree_count(N, leaf_N, 0) + 1;
+
+ ps = pps = malloc(2 * count * sizeof(*ps));
+ if (!ps) {
+ return NULL;
+ }
+
+ ffts_elaborate_tree(&pps, N, leaf_N, 0);
+
+ pps[0] = 0;
+ pps[1] = 0;
+
+ pps = ps;
#ifdef __x86_64__
- if(sign < 0) p->constants = sse_constants;
- else p->constants = sse_constants_inv;
+ if (sign < 0) {
+ p->constants = sse_constants;
+ } else {
+ p->constants = sse_constants_inv;
+ }
#endif
- elaborate_tree(&pps, N, leafN, 0);
- pps[0] = 0;
- pps[1] = 0;
+ fp = (insns_t*) p->transform_base;
- pps = ps;
+#ifdef __arm__
+#ifdef HAVE_NEON
+ memcpy(fp, neon_x8, neon_x8_t - neon_x8);
+ /*
+ * Changes adds to subtracts and vice versa to allow the computation
+ * of both the IFFT and FFT
+ */
+ if(sign < 0) {
+ fp[31] ^= 0x00200000;
+ fp[32] ^= 0x00200000;
+ fp[33] ^= 0x00200000;
+ fp[34] ^= 0x00200000;
+ fp[65] ^= 0x00200000;
+ fp[66] ^= 0x00200000;
+ fp[70] ^= 0x00200000;
+ fp[74] ^= 0x00200000;
+ fp[97] ^= 0x00200000;
+ fp[98] ^= 0x00200000;
+ fp[102] ^= 0x00200000;
+ fp[104] ^= 0x00200000;
+ }
+ fp += (neon_x8_t - neon_x8) / 4;
+#else
+ memcpy(fp, vfp_x8, vfp_end - vfp_x8);
+ if(sign > 0) {
+ fp[65] ^= 0x00000040;
+ fp[66] ^= 0x00000040;
+ fp[68] ^= 0x00000040;
+ fp[70] ^= 0x00000040;
+ fp[103] ^= 0x00000040;
+ fp[104] ^= 0x00000040;
+ fp[105] ^= 0x00000040;
+ fp[108] ^= 0x00000040;
+ fp[113] ^= 0x00000040;
+ fp[114] ^= 0x00000040;
+ fp[117] ^= 0x00000040;
+ fp[118] ^= 0x00000040;
+ }
+ fp += (vfp_end - vfp_x8) / 4;
+#endif
+#else
+ /* align call destination */
+ ffts_align_mem16(&fp, 0);
+ x_8_addr = fp;
-#ifdef __arm__
- if(N < 8192) p->transform_size = 8192;
- else p->transform_size = N;
+ /* align loop/jump destination */
+#ifdef _M_AMD64
+ ffts_align_mem16(&fp, 6);
#else
- if(N < 2048) p->transform_size = 16384;
- else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
+ ffts_align_mem16(&fp, 5);
+#endif
+
+ /* copy function */
+ assert((char*) x8_soft_end > (char*) x8_soft);
+ len = (char*) x8_soft_end - (char*) x8_soft;
+ memcpy(fp, x8_soft, (size_t) len);
+ fp += len;
#endif
+ //uint32_t *x_8_t_addr = fp;
+ //memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
+ //fp += (neon_end - neon_x8_t) / 4;
-#ifdef __APPLE__
- p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
+#ifdef __arm__
+#ifdef HAVE_NEON
+ memcpy(fp, neon_x4, neon_x8 - neon_x4);
+ if(sign < 0) {
+ fp[26] ^= 0x00200000;
+ fp[28] ^= 0x00200000;
+ fp[31] ^= 0x00200000;
+ fp[32] ^= 0x00200000;
+ }
+ fp += (neon_x8 - neon_x4) / 4;
#else
-#define MAP_ANONYMOUS 0x20
- p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+ memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
+ if(sign > 0) {
+ fp[36] ^= 0x00000040;
+ fp[38] ^= 0x00000040;
+ fp[43] ^= 0x00000040;
+ fp[44] ^= 0x00000040;
+ }
+ fp += (vfp_x8 - vfp_x4) / 4;
+#endif
+#else
+ /* align call destination */
+ ffts_align_mem16(&fp, 0);
+ x_4_addr = fp;
+
+ /* copy function */
+ assert((char*) x8_soft > (char*) x4);
+ len = (char*) x8_soft - (char*) x4;
+ memcpy(fp, x4, (size_t) len);
+ fp += len;
#endif
-/*
- if(p->transform_base == MAP_FAILED) {
- fprintf(stderr, "MAP FAILED\n");
- exit(1);
- }*/
- insns_t *func = p->transform_base;//valloc(8192);
- insns_t *fp = func;
-
-//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
-//fprintf(stderr, "Base address = %016p\n", func);
-
- if(!func) {
- fprintf(stderr, "NOMEM\n");
- exit(1);
- }
-
- insns_t *x_8_addr = fp;
#ifdef __arm__
+ start = fp;
+
+ *fp = PUSH_LR();
+ fp++;
+ *fp = 0xed2d8b10;
+ fp++;
+
+ ADDI(&fp, 3, 1, 0);
+ ADDI(&fp, 7, 1, N);
+ ADDI(&fp, 5, 1, 2*N);
+ ADDI(&fp, 10, 7, 2*N);
+ ADDI(&fp, 4, 5, 2*N);
+ ADDI(&fp, 8, 10, 2*N);
+ ADDI(&fp, 6, 4, 2*N);
+ ADDI(&fp, 9, 8, 2*N);
+
+ *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p));
+ fp++; // load offsets into r12
+ // *fp++ = LDRI(1, 0, 4); // load ws into r1
+ ADDI(&fp, 1, 0, 0);
+
+	ADDI(&fp, 0, 2, 0); // mov out into r0
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
+ fp++;
+
#ifdef HAVE_NEON
- memcpy(fp, neon_x8, neon_x8_t - neon_x8);
- /*
- * Changes adds to subtracts and vice versa to allow the computation
- * of both the IFFT and FFT
- */
- if(sign < 0) {
- fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
- fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
- fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
- }
- fp += (neon_x8_t - neon_x8) / 4;
+ MOVI(&fp, 11, p->i0);
+#else
+ MOVI(&fp, 11, p->i0);
+#endif
+#else
+ /* align call destination */
+ ffts_align_mem16(&fp, 0);
+ start = fp;
+
+ /* save nonvolatile registers */
+#ifdef _M_AMD64
+ /* use the shadow space to save first 3 registers */
+
+ /* mov [rsp + 8], rbx */
+ *fp++ = 0x48;
+ *fp++ = 0x89;
+ *fp++ = 0x5C;
+ *fp++ = 0x24;
+ *fp++ = 0x08;
+
+ /* mov [rsp + 16], rsi */
+ *fp++ = 0x48;
+ *fp++ = 0x89;
+ *fp++ = 0x74;
+ *fp++ = 0x24;
+ *fp++ = 0x10;
+
+ /* mov [rsp + 24], rdi */
+ *fp++ = 0x48;
+ *fp++ = 0x89;
+ *fp++ = 0x7C;
+ *fp++ = 0x24;
+ *fp++ = 0x18;
#else
- memcpy(fp, vfp_x8, vfp_end - vfp_x8);
- if(sign > 0) {
- fp[65] ^= 0x00000040;
- fp[66] ^= 0x00000040;
- fp[68] ^= 0x00000040;
- fp[70] ^= 0x00000040;
- fp[103] ^= 0x00000040;
- fp[104] ^= 0x00000040;
- fp[105] ^= 0x00000040;
- fp[108] ^= 0x00000040;
- fp[113] ^= 0x00000040;
- fp[114] ^= 0x00000040;
- fp[117] ^= 0x00000040;
- fp[118] ^= 0x00000040;
- }
- fp += (vfp_end - vfp_x8) / 4;
+ PUSH(&fp, RBP);
+ PUSH(&fp, RBX);
+ PUSH(&fp, R10);
+ PUSH(&fp, R11);
+ PUSH(&fp, R12);
+ PUSH(&fp, R13);
+ PUSH(&fp, R14);
+ PUSH(&fp, R15);
#endif
+
+ /* assign loop counter register */
+ loop_count = p->i0 * 4;
+#ifdef _M_AMD64
+ MOVI(&fp, EBX, loop_count);
#else
- align_mem16(&fp, 0);
- x_8_addr = fp;
- align_mem16(&fp, 5);
- memcpy(fp, x8_soft, x8_hard - x8_soft);
- fp += (x8_hard - x8_soft);
-//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
+ MOVI(&fp, ECX, loop_count);
+#endif
#endif
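
Review note on the new Win64 paths (selected with MSVC's predefined _M_AMD64 macro): the Microsoft x64 ABI treats RBX, RSI and RDI as nonvolatile and reserves a 32-byte shadow space above the return address, so the prologue spills the first three saves there with mov [rsp+8/16/24], reg instead of pushing; the loop counter also moves from ECX (free under the SysV convention) to EBX, since RCX carries the first argument on Win64. The hand-written bytes above decode as REX.W (48), opcode 89 (MOV r/m64, r64), then a ModRM/SIB pair addressing [rsp+disp8]. A sketch of that encoding; the helper name is hypothetical, not part of FFTS:

/* Sketch: encode "mov [rsp + disp8], reg" for reg in rax..rdi,
 * matching the hand-written byte sequences above (48 89 /r 24 disp8). */
#include <stdint.h>

static void emit_mov_store_rsp(uint8_t **p, uint8_t reg, int8_t disp)
{
    *(*p)++ = 0x48;                    /* REX.W: 64-bit operand size        */
    *(*p)++ = 0x89;                    /* MOV r/m64, r64                    */
    *(*p)++ = 0x44 | ((reg & 7) << 3); /* ModRM: mod=01 (disp8), rm=100=SIB */
    *(*p)++ = 0x24;                    /* SIB: base = RSP, no index         */
    *(*p)++ = (uint8_t) disp;
}

/* emit_mov_store_rsp(&fp, 3, 8)  emits 48 89 5C 24 08 (mov [rsp+8],  rbx)
 * emit_mov_store_rsp(&fp, 6, 16) emits 48 89 74 24 10 (mov [rsp+16], rsi)
 * emit_mov_store_rsp(&fp, 7, 24) emits 48 89 7C 24 18 (mov [rsp+24], rdi) */
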
-//uint32_t *x_8_t_addr = fp;
-//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
-//fp += (neon_end - neon_x8_t) / 4;
- insns_t *x_4_addr = fp;
+
#ifdef __arm__
- #ifdef HAVE_NEON
- memcpy(fp, neon_x4, neon_x8 - neon_x4);
- if(sign < 0) {
- fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
- }
- fp += (neon_x8 - neon_x4) / 4;
- #else
- memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
- if(sign > 0) {
- fp[36] ^= 0x00000040;
- fp[38] ^= 0x00000040;
- fp[43] ^= 0x00000040;
- fp[44] ^= 0x00000040;
- }
- fp += (vfp_x8 - vfp_x4) / 4;
- #endif
+#ifdef HAVE_NEON
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ if (sign < 0) {
+ fp[33] ^= 0x00200000;
+ fp[37] ^= 0x00200000;
+ fp[38] ^= 0x00200000;
+ fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000;
+ fp[41] ^= 0x00200000;
+ fp[44] ^= 0x00200000;
+ fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000;
+ fp[47] ^= 0x00200000;
+ fp[48] ^= 0x00200000;
+ fp[57] ^= 0x00200000;
+ }
+
+ fp += (neon_oo - neon_ee) / 4;
+#else
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+
+ if (sign > 0) {
+ fp[64] ^= 0x00000040;
+ fp[65] ^= 0x00000040;
+ fp[68] ^= 0x00000040;
+ fp[75] ^= 0x00000040;
+ fp[76] ^= 0x00000040;
+ fp[79] ^= 0x00000040;
+ fp[80] ^= 0x00000040;
+ fp[83] ^= 0x00000040;
+ fp[84] ^= 0x00000040;
+ fp[87] ^= 0x00000040;
+ fp[91] ^= 0x00000040;
+ fp[93] ^= 0x00000040;
+ }
+ fp += (vfp_o - vfp_e) / 4;
+#endif
#else
- align_mem16(&fp, 0);
- x_4_addr = fp;
- memcpy(fp, x4, x8_soft - x4);
- fp += (x8_soft - x4);
+ //fprintf(stderr, "Body start address = %016p\n", start);
+ /* copy function */
+ assert((char*) leaf_ee > (char*) leaf_ee_init);
+ len = (char*) leaf_ee - (char*) leaf_ee_init;
+ memcpy(fp, leaf_ee_init, (size_t) len);
+ fp += len;
+
+ /* align loop/jump destination */
+#ifdef _M_AMD64
+ ffts_align_mem16(&fp, 8);
+#else
+ ffts_align_mem16(&fp, 9);
#endif
- insns_t *start = fp;
-
-#ifdef __arm__
- *fp = PUSH_LR(); fp++;
- *fp = 0xed2d8b10; fp++;
-
- ADDI(&fp, 3, 1, 0);
- ADDI(&fp, 7, 1, N);
- ADDI(&fp, 5, 1, 2*N);
- ADDI(&fp, 10, 7, 2*N);
- ADDI(&fp, 4, 5, 2*N);
- ADDI(&fp, 8, 10, 2*N);
- ADDI(&fp, 6, 4, 2*N);
- ADDI(&fp, 9, 8, 2*N);
-
- *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
-// *fp++ = LDRI(1, 0, 4); // load ws into r1
- ADDI(&fp, 1, 0, 0);
-
- ADDI(&fp, 0, 2, 0), // mov out into r0
+
+ /* copy function */
+ assert((char*) leaf_oo > (char*) leaf_ee);
+ len = (char*) leaf_oo - (char*) leaf_ee;
+ memcpy(fp, leaf_ee, (size_t) len);
+
+ /* patch offsets */
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets[i]);
+ }
+
+ fp += len;
+
+ if (ffts_ctzl(N) & 1) {
+ if (p->i1) {
+ loop_count += 4 * p->i1;
+
+ /* align loop/jump destination */
+#ifdef _M_AMD64
+ MOVI(&fp, EBX, loop_count);
+ ffts_align_mem16(&fp, 3);
+#else
+ MOVI(&fp, ECX, loop_count);
+ ffts_align_mem16(&fp, 4);
#endif
+ /* copy function */
+ assert((char*) leaf_eo > (char*) leaf_oo);
+ len = (char*) leaf_eo - (char*) leaf_oo;
+ memcpy(fp, leaf_oo, len);
-#ifdef __arm__
- *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
- #ifdef HAVE_NEON
- MOVI(&fp, 11, p->i0);
- #else
- MOVI(&fp, 11, p->i0);
- #endif
+ /* patch offsets */
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]);
+ }
+
+ fp += len;
+ }
+
+ loop_count += 4;
+
+ /* copy function */
+ assert((char*) leaf_end > (char*) leaf_oe);
+ len = (char*) leaf_end - (char*) leaf_oe;
+ memcpy(fp, leaf_oe, len);
+
+ /* patch offsets */
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_oe_offsets[i], 4 * offsets_o[i]);
+ }
+ fp += len;
+ } else {
+ loop_count += 4;
+
+ /* copy function */
+ assert((char*) leaf_oe > (char*) leaf_eo);
+ len = (char*) leaf_oe - (char*) leaf_eo;
+ memcpy(fp, leaf_eo, len);
+
+ /* patch offsets */
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_eo_offsets[i], 4 * offsets[i]);
+ }
+
+ fp += len;
+
+ if (p->i1) {
+ loop_count += 4 * p->i1;
+
+ /* align loop/jump destination */
+#ifdef _M_AMD64
+ MOVI(&fp, EBX, loop_count);
+ ffts_align_mem16(&fp, 3);
#else
- align_mem16(&fp, 0);
- start = fp;
-
- *fp++ = 0x4c;
- *fp++ = 0x8b;
- *fp++ = 0x07;
- uint32_t lp_cnt = p->i0 * 4;
- MOVI(&fp, RCX, lp_cnt);
-
- //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
+ MOVI(&fp, ECX, loop_count);
+ ffts_align_mem16(&fp, 4);
#endif
- //fp++;
-#ifdef __arm__
-#ifdef HAVE_NEON
- memcpy(fp, neon_ee, neon_oo - neon_ee);
- if(sign < 0) {
- fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
- fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
- fp += (neon_oo - neon_ee) / 4;
+
+ /* copy function */
+ assert((char*) leaf_eo > (char*) leaf_oo);
+ len = (char*) leaf_eo - (char*) leaf_oo;
+ memcpy(fp, leaf_oo, len);
+
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_oo_offsets[i], 4 * offsets_o[i]);
+ }
+
+ fp += len;
+ }
+ }
+
+ if (p->i1) {
+ uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
+
+ loop_count += 4 * p->i1;
+
+ /* align loop/jump destination */
+#ifdef _M_AMD64
+ MOVI(&fp, EBX, loop_count);
+ ffts_align_mem16(&fp, 8);
#else
- memcpy(fp, vfp_e, vfp_o - vfp_e);
- if(sign > 0) {
- fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
- fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
- fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
- }
- fp += (vfp_o - vfp_e) / 4;
+ MOVI(&fp, ECX, loop_count);
+ ffts_align_mem16(&fp, 9);
#endif
+
+ assert((char*) leaf_oo > (char*) leaf_ee);
+ len = (char*) leaf_oo - (char*) leaf_ee;
+ memcpy(fp, leaf_ee, len);
+
+ for (i = 0; i < 8; i++) {
+ IMM32_NI(fp + sse_leaf_ee_offsets[i], 4 * offsets_oe[i]);
+ }
+
+ fp += len;
+ }
+
+ //fprintf(stderr, "Body start address = %016p\n", fp);
+ //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
+ memcpy(fp, x_init, (char*) x4 - (char*) x_init);
+ //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
+ fp += ((char*) x4 - (char*) x_init);
+
+ count = 2;
+ while (pps[0]) {
+ size_t ws_is;
+
+ if (!pN) {
+#ifdef _M_AMD64
+ MOVI(&fp, EBX, pps[0]);
#else
-//fprintf(stderr, "Body start address = %016p\n", start);
-
- PUSH(&fp, RBP);
- PUSH(&fp, RBX);
- PUSH(&fp, R10);
- PUSH(&fp, R11);
- PUSH(&fp, R12);
- PUSH(&fp, R13);
- PUSH(&fp, R14);
- PUSH(&fp, R15);
-
- int i;
- memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
-
-//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
-//fprintf(stderr, "Constants address = %016p\n", sse_constants);
-//fprintf(stderr, "Constants address = %016p\n", p->constants);
-
-//int32_t val = READ_IMM32(fp + 3);
-//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
-
-//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
-//fprintf(stderr, "IMM = 0x%llx\n", v2);
-
-//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
- fp += (leaf_ee - leaf_ee_init);
-
-//fprintf(stderr, "Leaf start address = %016p\n", fp);
- align_mem16(&fp, 9);
- memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
-
-
- uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
- uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
- uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
-
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
-
- fp += (leaf_oo - leaf_ee);
-
- if(__builtin_ctzl(N) & 1){
-
- if(p->i1) {
- lp_cnt += p->i1 * 4;
- MOVI(&fp, RCX, lp_cnt);
- align_mem16(&fp, 4);
- memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
- fp += (leaf_eo - leaf_oo);
- }
-
-
- memcpy(fp, leaf_oe, leaf_end - leaf_oe);
- lp_cnt += 4;
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
- fp += (leaf_end - leaf_oe);
-
- }else{
-
-
- memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
- lp_cnt += 4;
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
- fp += (leaf_oe - leaf_eo);
-
- if(p->i1) {
- lp_cnt += p->i1 * 4;
- MOVI(&fp, RCX, lp_cnt);
- align_mem16(&fp, 4);
- memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
- fp += (leaf_eo - leaf_oo);
- }
-
- }
- if(p->i1) {
- lp_cnt += p->i1 * 4;
- MOVI(&fp, RCX, lp_cnt);
- align_mem16(&fp, 9);
- memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
- for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
- fp += (leaf_oo - leaf_ee);
-
- }
-
-//fprintf(stderr, "Body start address = %016p\n", fp);
- //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
- memcpy(fp, x_init, x4 - x_init);
-//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
- fp += (x4 - x_init);
-
- int32_t pAddr = 0;
- int32_t pN = 0;
- int32_t pLUT = 0;
- count = 2;
- while(pps[0]) {
-
- if(!pN) {
- MOVI(&fp, RCX, pps[0] / 4);
- }else{
- if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
- if(pps[0] > leafN && pps[0] - pN) {
-
- int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
- *fp++ = 0xc1;
-
- if(diff > 0) {
- *fp++ = 0xe1;
- *fp++ = (diff & 0xff);
- }else{
- *fp++ = 0xe9;
- *fp++ = ((-diff) & 0xff);
- }
- }
- }
-
- if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
- ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
-
-
- if(pps[0] == 2*leafN) {
- CALL(&fp, x_4_addr);
- // }else if(!pps[2]){
- // //uint32_t *x_8_t_addr = fp;
- // memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
- // fp += (neon_ee - neon_x8_t) / 4;
- // //*fp++ = BL(fp+2, x_8_t_addr);
- }else{
- CALL(&fp, x_8_addr);
- }
-
- pAddr = pps[1] * 4;
- if(pps[0] > leafN)
- pN = pps[0];
- pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
-// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
- count += 4;
- pps += 2;
- }
+ MOVI(&fp, ECX, pps[0] / 4);
#endif
+ } else {
+ int offset = (4 * pps[1]) - pAddr;
+ if (offset) {
+#ifdef _M_AMD64
+ ADDI(&fp, R8, offset);
+#else
+ ADDI(&fp, RDX, offset);
+#endif
+ }
+
+ if (pps[0] > leaf_N && pps[0] - pN) {
+ int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN);
+
+#ifdef _M_AMD64
+ SHIFT(&fp, EBX, factor);
+#else
+ SHIFT(&fp, ECX, factor);
+#endif
+ }
+ }
+
+ ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1];
+ if (ws_is != pLUT) {
+ int offset = (int) (ws_is - pLUT);
+
+#ifdef _M_AMD64
+ ADDI(&fp, RDI, offset);
+#else
+ ADDI(&fp, R8, offset);
+#endif
+ }
+
+ if (pps[0] == 2 * leaf_N) {
+ CALL(&fp, x_4_addr);
+ } else {
+ CALL(&fp, x_8_addr);
+ }
+
+ pAddr = 4 * pps[1];
+ if (pps[0] > leaf_N) {
+ pN = pps[0];
+ }
+
+ pLUT = ws_is;//LUT_offset(pps[0], leafN);
+ //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
+ count += 4;
+ pps += 2;
+ }
+#endif
+
#ifdef __arm__
#ifdef HAVE_NEON
- if(__builtin_ctzl(N) & 1){
- ADDI(&fp, 2, 7, 0);
- ADDI(&fp, 7, 9, 0);
- ADDI(&fp, 9, 2, 0);
-
- ADDI(&fp, 2, 8, 0);
- ADDI(&fp, 8, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- if(p->i1) {
- MOVI(&fp, 11, p->i1);
- memcpy(fp, neon_oo, neon_eo - neon_oo);
- if(sign < 0) {
- fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
- fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
- fp += (neon_eo - neon_oo) / 4;
- }
-
- *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;
-
- memcpy(fp, neon_oe, neon_end - neon_oe);
- if(sign < 0) {
- fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
- fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
- fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
- }
- fp += (neon_end - neon_oe) / 4;
-
- }else{
-
- *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;
-
- memcpy(fp, neon_eo, neon_oe - neon_eo);
- if(sign < 0) {
- fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
- fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
- fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
- }
- fp += (neon_oe - neon_eo) / 4;
-
- ADDI(&fp, 2, 7, 0);
- ADDI(&fp, 7, 9, 0);
- ADDI(&fp, 9, 2, 0);
-
- ADDI(&fp, 2, 8, 0);
- ADDI(&fp, 8, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- if(p->i1) {
- MOVI(&fp, 11, p->i1);
- memcpy(fp, neon_oo, neon_eo - neon_oo);
- if(sign < 0) {
- fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
- fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
- fp += (neon_eo - neon_oo) / 4;
- }
-
- }
-
-
- if(p->i1) {
- ADDI(&fp, 2, 3, 0);
- ADDI(&fp, 3, 7, 0);
- ADDI(&fp, 7, 2, 0);
-
- ADDI(&fp, 2, 4, 0);
- ADDI(&fp, 4, 8, 0);
- ADDI(&fp, 8, 2, 0);
-
- ADDI(&fp, 2, 5, 0);
- ADDI(&fp, 5, 9, 0);
- ADDI(&fp, 9, 2, 0);
-
- ADDI(&fp, 2, 6, 0);
- ADDI(&fp, 6, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- ADDI(&fp, 2, 9, 0);
- ADDI(&fp, 9, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
- MOVI(&fp, 11, p->i1);
- memcpy(fp, neon_ee, neon_oo - neon_ee);
- if(sign < 0) {
- fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
- fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
- fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
- }
- fp += (neon_oo - neon_ee) / 4;
-
- }
+ if(__builtin_ctzl(N) & 1) {
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ if(sign < 0) {
+ fp[12] ^= 0x00200000;
+ fp[13] ^= 0x00200000;
+ fp[14] ^= 0x00200000;
+ fp[15] ^= 0x00200000;
+ fp[27] ^= 0x00200000;
+ fp[29] ^= 0x00200000;
+ fp[30] ^= 0x00200000;
+ fp[31] ^= 0x00200000;
+ fp[46] ^= 0x00200000;
+ fp[47] ^= 0x00200000;
+ fp[48] ^= 0x00200000;
+ fp[57] ^= 0x00200000;
+ }
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
+ fp++;
+
+ memcpy(fp, neon_oe, neon_end - neon_oe);
+ if(sign < 0) {
+ fp[19] ^= 0x00200000;
+ fp[20] ^= 0x00200000;
+ fp[22] ^= 0x00200000;
+ fp[23] ^= 0x00200000;
+ fp[37] ^= 0x00200000;
+ fp[38] ^= 0x00200000;
+ fp[40] ^= 0x00200000;
+ fp[41] ^= 0x00200000;
+ fp[64] ^= 0x00200000;
+ fp[65] ^= 0x00200000;
+ fp[66] ^= 0x00200000;
+ fp[67] ^= 0x00200000;
+ }
+ fp += (neon_end - neon_oe) / 4;
+
+ } else {
+
+ *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
+ fp++;
+
+ memcpy(fp, neon_eo, neon_oe - neon_eo);
+ if(sign < 0) {
+ fp[10] ^= 0x00200000;
+ fp[11] ^= 0x00200000;
+ fp[13] ^= 0x00200000;
+ fp[14] ^= 0x00200000;
+ fp[31] ^= 0x00200000;
+ fp[33] ^= 0x00200000;
+ fp[34] ^= 0x00200000;
+ fp[35] ^= 0x00200000;
+ fp[59] ^= 0x00200000;
+ fp[60] ^= 0x00200000;
+ fp[61] ^= 0x00200000;
+ fp[62] ^= 0x00200000;
+ }
+ fp += (neon_oe - neon_eo) / 4;
+
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ if(p->i1) {
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_oo, neon_eo - neon_oo);
+ if(sign < 0) {
+ fp[12] ^= 0x00200000;
+ fp[13] ^= 0x00200000;
+ fp[14] ^= 0x00200000;
+ fp[15] ^= 0x00200000;
+ fp[27] ^= 0x00200000;
+ fp[29] ^= 0x00200000;
+ fp[30] ^= 0x00200000;
+ fp[31] ^= 0x00200000;
+ fp[46] ^= 0x00200000;
+ fp[47] ^= 0x00200000;
+ fp[48] ^= 0x00200000;
+ fp[57] ^= 0x00200000;
+ }
+ fp += (neon_eo - neon_oo) / 4;
+ }
+
+ }
+
+
+ if(p->i1) {
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
+ fp++;
+ MOVI(&fp, 11, p->i1);
+ memcpy(fp, neon_ee, neon_oo - neon_ee);
+ if(sign < 0) {
+ fp[33] ^= 0x00200000;
+ fp[37] ^= 0x00200000;
+ fp[38] ^= 0x00200000;
+ fp[39] ^= 0x00200000;
+ fp[40] ^= 0x00200000;
+ fp[41] ^= 0x00200000;
+ fp[44] ^= 0x00200000;
+ fp[45] ^= 0x00200000;
+ fp[46] ^= 0x00200000;
+ fp[47] ^= 0x00200000;
+ fp[48] ^= 0x00200000;
+ fp[57] ^= 0x00200000;
+ }
+ fp += (neon_oo - neon_ee) / 4;
+
+ }
#else
- ADDI(&fp, 2, 7, 0);
- ADDI(&fp, 7, 9, 0);
- ADDI(&fp, 9, 2, 0);
-
- ADDI(&fp, 2, 8, 0);
- ADDI(&fp, 8, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
- memcpy(fp, vfp_o, vfp_x4 - vfp_o);
- if(sign > 0) {
- fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
- fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
- }
- fp += (vfp_x4 - vfp_o) / 4;
-
- ADDI(&fp, 2, 3, 0);
- ADDI(&fp, 3, 7, 0);
- ADDI(&fp, 7, 2, 0);
-
- ADDI(&fp, 2, 4, 0);
- ADDI(&fp, 4, 8, 0);
- ADDI(&fp, 8, 2, 0);
-
- ADDI(&fp, 2, 5, 0);
- ADDI(&fp, 5, 9, 0);
- ADDI(&fp, 9, 2, 0);
-
- ADDI(&fp, 2, 6, 0);
- ADDI(&fp, 6, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- ADDI(&fp, 2, 9, 0);
- ADDI(&fp, 9, 10, 0);
- ADDI(&fp, 10, 2, 0);
-
- *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
- MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
- memcpy(fp, vfp_e, vfp_o - vfp_e);
- if(sign > 0) {
- fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
- fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
- fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
- }
- fp += (vfp_o - vfp_e) / 4;
+ ADDI(&fp, 2, 7, 0);
+ ADDI(&fp, 7, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 8, 0);
+ ADDI(&fp, 8, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
+ memcpy(fp, vfp_o, vfp_x4 - vfp_o);
+ if(sign > 0) {
+ fp[22] ^= 0x00000040;
+ fp[24] ^= 0x00000040;
+ fp[25] ^= 0x00000040;
+ fp[26] ^= 0x00000040;
+ fp[62] ^= 0x00000040;
+ fp[64] ^= 0x00000040;
+ fp[65] ^= 0x00000040;
+ fp[66] ^= 0x00000040;
+ }
+ fp += (vfp_x4 - vfp_o) / 4;
+
+ ADDI(&fp, 2, 3, 0);
+ ADDI(&fp, 3, 7, 0);
+ ADDI(&fp, 7, 2, 0);
+
+ ADDI(&fp, 2, 4, 0);
+ ADDI(&fp, 4, 8, 0);
+ ADDI(&fp, 8, 2, 0);
+
+ ADDI(&fp, 2, 5, 0);
+ ADDI(&fp, 5, 9, 0);
+ ADDI(&fp, 9, 2, 0);
+
+ ADDI(&fp, 2, 6, 0);
+ ADDI(&fp, 6, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ ADDI(&fp, 2, 9, 0);
+ ADDI(&fp, 9, 10, 0);
+ ADDI(&fp, 10, 2, 0);
+
+ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
+ fp++;
+ MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
+ memcpy(fp, vfp_e, vfp_o - vfp_e);
+ if(sign > 0) {
+ fp[64] ^= 0x00000040;
+ fp[65] ^= 0x00000040;
+ fp[68] ^= 0x00000040;
+ fp[75] ^= 0x00000040;
+ fp[76] ^= 0x00000040;
+ fp[79] ^= 0x00000040;
+ fp[80] ^= 0x00000040;
+ fp[83] ^= 0x00000040;
+ fp[84] ^= 0x00000040;
+ fp[87] ^= 0x00000040;
+ fp[91] ^= 0x00000040;
+ fp[93] ^= 0x00000040;
+ }
+ fp += (vfp_o - vfp_e) / 4;
#endif
- *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
- //ADDI(&fp, 2, 1, 0);
- MOVI(&fp, 1, 0);
-
- // args: r0 - out
- // r1 - N
- // r2 - ws
-// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
-
- int32_t pAddr = 0;
- int32_t pN = 0;
- int32_t pLUT = 0;
- count = 2;
- while(pps[0]) {
-
-// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
- if(!pN) {
- MOVI(&fp, 1, pps[0]);
- }else{
- if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
- if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
- }
-
- if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
- ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
-
-
- if(pps[0] == 2*leafN) {
- *fp = BL(fp+2, x_4_addr); fp++;
- }else if(!pps[2]){
- //uint32_t *x_8_t_addr = fp;
+ *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p));
+ fp++; // load offsets into r12
+ //ADDI(&fp, 2, 1, 0);
+ MOVI(&fp, 1, 0);
+
+ // args: r0 - out
+ // r1 - N
+ // r2 - ws
+ // ADDI(&fp, 3, 1, 0); // put N into r3 for counter
+
+	pAddr = 0;
+	pN = 0;
+	pLUT = 0;
+ count = 2;
+ while(pps[0]) {
+
+ // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
+ if(!pN) {
+ MOVI(&fp, 1, pps[0]);
+ } else {
+ if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
+ if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
+ }
+
+	if(p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT)
+	ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
+
+
+	if(pps[0] == 2*leaf_N) {
+ *fp = BL(fp+2, x_4_addr);
+ fp++;
+ } else if(!pps[2]) {
+ //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
- memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
- if(sign < 0) {
- fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
- fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
- fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
- }
- fp += (neon_ee - neon_x8_t) / 4;
- //*fp++ = BL(fp+2, x_8_t_addr);
+ memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
+ if(sign < 0) {
+ fp[31] ^= 0x00200000;
+ fp[32] ^= 0x00200000;
+ fp[33] ^= 0x00200000;
+ fp[34] ^= 0x00200000;
+ fp[65] ^= 0x00200000;
+ fp[66] ^= 0x00200000;
+ fp[70] ^= 0x00200000;
+ fp[74] ^= 0x00200000;
+ fp[97] ^= 0x00200000;
+ fp[98] ^= 0x00200000;
+ fp[102] ^= 0x00200000;
+ fp[104] ^= 0x00200000;
+ }
+ fp += (neon_ee - neon_x8_t) / 4;
+ //*fp++ = BL(fp+2, x_8_t_addr);
#else
- *fp = BL(fp+2, x_8_addr); fp++;
+ *fp = BL(fp+2, x_8_addr);
+ fp++;
#endif
- }else{
- *fp = BL(fp+2, x_8_addr); fp++;
- }
-
- pAddr = pps[1] * 4;
- pN = pps[0];
- pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
-// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
- count += 4;
- pps += 2;
- }
-
- *fp++ = 0xecbd8b10;
- *fp++ = POP_LR(); count++;
+ } else {
+ *fp = BL(fp+2, x_8_addr);
+ fp++;
+ }
+
+ pAddr = pps[1] * 4;
+ pN = pps[0];
+	pLUT = p->ws_is[__builtin_ctzl(pps[0]/leaf_N)-1]*8; //LUT_offset(pps[0], leaf_N);
+ // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
+ count += 4;
+ pps += 2;
+ }
+
+ *fp++ = 0xecbd8b10;
+ *fp++ = POP_LR();
+ count++;
#else
- POP(&fp, R15);
- POP(&fp, R14);
- POP(&fp, R13);
- POP(&fp, R12);
- POP(&fp, R11);
- POP(&fp, R10);
- POP(&fp, RBX);
- POP(&fp, RBP);
- RET(&fp);
-
-//uint8_t *pp = func;
-//int counter = 0;
-//do{
-// printf("%02x ", *pp);
-// if(counter++ % 16 == 15) printf("\n");
-//} while(++pp < fp);
+ /* restore nonvolatile registers */
+#ifdef _M_AMD64
+ /* mov rbx, [rsp + 8] */
+ *fp++ = 0x48;
+ *fp++ = 0x8B;
+ *fp++ = 0x5C;
+ *fp++ = 0x24;
+ *fp++ = 0x08;
+
+ /* mov rsi, [rsp + 16] */
+ *fp++ = 0x48;
+ *fp++ = 0x8B;
+ *fp++ = 0x74;
+ *fp++ = 0x24;
+ *fp++ = 0x10;
+
+ /* mov rdi, [rsp + 24] */
+ *fp++ = 0x48;
+ *fp++ = 0x8B;
+ *fp++ = 0x7C;
+ *fp++ = 0x24;
+ *fp++ = 0x18;
+#else
+ POP(&fp, R15);
+ POP(&fp, R14);
+ POP(&fp, R13);
+ POP(&fp, R12);
+ POP(&fp, R11);
+ POP(&fp, R10);
+ POP(&fp, RBX);
+ POP(&fp, RBP);
+#endif
-//printf("\n");
+ RET(&fp);
+ //uint8_t *pp = func;
+ //int counter = 0;
+ //do{
+ // printf("%02x ", *pp);
+ // if(counter++ % 16 == 15) printf("\n");
+ //} while(++pp < fp);
+ //printf("\n");
#endif
+ // *fp++ = B(14); count++;
-// *fp++ = B(14); count++;
-
-//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
-// fprintf(stderr, "%08x\n", x_4_addr[i]);
-//fprintf(stderr, "\n");
-//for(int i=0;i<count;i++)
-
- free(ps);
-
- if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
- perror("Couldn't mprotect");
- exit(1);
- }
-#ifdef __APPLE__
- sys_icache_invalidate(func, p->transform_size);
-#elif __ANDROID__
- cacheflush((long)(func), (long)(func) + p->transform_size, 0);
-#elif __linux__
-#ifdef __GNUC__
- __clear_cache((long)(func), (long)(func) + p->transform_size);
-#endif
-#endif
+ //for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
+ // fprintf(stderr, "%08x\n", x_4_addr[i]);
+ //fprintf(stderr, "\n");
+ //for(int i=0;i<count;i++)
-//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
+	fprintf(stderr, "size of transform %zu = %zu\n", N, (size_t) (fp - x_8_addr) * sizeof(*fp));
- p->transform = (void *) (start);
-}
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+ free(ps);
+
+ return (transform_func_t) start;
+}
\ No newline at end of file
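
Review note: with this patch, ffts_generate_func_code no longer mmaps or mprotects anything itself. It writes into p->transform_base, which the caller must have allocated, and returns the entry point as a transform_func_t. The actual call site in ffts_init_1d is outside the visible hunks, so the following is only a sketch of the new contract, written as if it lived in src/ffts.c next to the static helpers added below; ffts_attach_transform is a hypothetical name:

/* Sketch of the caller side (assumption: p->transform_size is already set;
 * note that on POSIX a failed mmap returns MAP_FAILED rather than NULL). */
static int ffts_attach_transform(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    /* RW pages for the generated code */
    p->transform_base = ffts_vmem_alloc(p->transform_size);
    if (!p->transform_base) {
        return -1;
    }

    /* generator writes into p->transform_base and returns the entry point */
    p->transform = ffts_generate_func_code(p, N, leaf_N, sign);
    if (!p->transform) {
        return -1;
    }

    /* flip the pages from RW to RX, then make the icache see the new code */
    if (ffts_allow_execute(p->transform_base, p->transform_size)) {
        return -1;
    }

    return ffts_flush_instruction_cache(p->transform_base, p->transform_size);
}
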
diff --git a/src/codegen.h b/src/codegen.h
index c07144f..e3c2381 100644
--- a/src/codegen.h
+++ b/src/codegen.h
@@ -1,10 +1,10 @@
/*
-
+
This file is part of FFTS -- The Fastest Fourier Transform in the South
-
+
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
+ Copyright (c) 2012, The University of Waikato
+
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -31,20 +31,15 @@
*/
-#ifndef __CODEGEN_H__
-#define __CODEGEN_H__
+#ifndef FFTS_CODEGEN_H
+#define FFTS_CODEGEN_H
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <string.h>
-#include <limits.h> /* for PAGESIZE */
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
#include "ffts.h"
-void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign);
+transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign);
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_CODEGEN_H */
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
index 6a38671..a63d21d 100644
--- a/src/codegen_sse.h
+++ b/src/codegen_sse.h
@@ -1,10 +1,10 @@
/*
-
+
This file is part of FFTS -- The Fastest Fourier Transform in the South
-
+
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
+ Copyright (c) 2012, The University of Waikato
+
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -32,8 +32,8 @@
*/
-#ifndef __CODEGEN_SSE_H__
-#define __CODEGEN_SSE_H__
+#ifndef FFTS_CODEGEN_SSE_H
+#define FFTS_CODEGEN_SSE_H
void neon_x4(float *, size_t, float *);
void neon_x8(float *, size_t, float *);
@@ -47,7 +47,7 @@ void leaf_end();
void x_init();
void x4();
void x8_soft();
-void x8_hard();
+void x8_soft_end();
void sse_constants();
void sse_constants_inv();
@@ -74,123 +74,157 @@ extern const uint32_t sse_leaf_oe_offsets[8];
#define RSI 6
#define RDI 7
#define RBP 5
-#define R8 8
-#define R9 9
-#define R10 10
-#define R11 11
-#define R12 12
-#define R13 13
-#define R14 14
-#define R15 15
-
-void IMM8(uint8_t **p, int32_t imm) {
- *(*p)++ = (imm & 0xff);
+#define R8 8
+#define R9 9
+#define R10 10
+#define R11 11
+#define R12 12
+#define R13 13
+#define R14 14
+#define R15 15
+
+void IMM8(uint8_t **p, int32_t imm)
+{
+ *(*p)++ = (imm & 0xff);
}
-void IMM16(uint8_t **p, int32_t imm) {
- int i;
- for(i=0;i<2;i++) {
- *(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
- }
+void IMM16(uint8_t **p, int32_t imm)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i);
+ }
}
-void IMM32(uint8_t **p, int32_t imm) {
- int i;
- for(i=0;i<4;i++) {
- *(*p)++ = (imm & (0xff << (i*8))) >> (i*8);
- }
+
+void IMM32(uint8_t **p, int32_t imm)
+{
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ *(*p)++ = (imm & (0xff << (8 * i))) >> (8 * i);
+ }
}
-void IMM32_NI(uint8_t *p, int32_t imm) {
- int i;
- for(i=0;i<4;i++) {
- *(p+i) = (imm & (0xff << (i*8))) >> (i*8);
- }
+
+void IMM32_NI(uint8_t *p, int32_t imm)
+{
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ *(p+i) = (imm & (0xff << (8 * i))) >> (8 * i);
+ }
}
-int32_t READ_IMM32(uint8_t *p) {
- int32_t rval = 0;
- int i;
- for(i=0;i<4;i++) {
- rval |= *(p+i) << (i*8);
- }
- return rval;
+int32_t READ_IMM32(uint8_t *p)
+{
+ int32_t rval = 0;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ rval |= *(p+i) << (8 * i);
+ }
+
+ return rval;
}
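
Review note: IMM32 and READ_IMM32 are inverse little-endian (de)serializers for immediate and displacement fields. One nit carried over from the old code: 0xff << (8 * i) shifts into the sign bit when i == 3 (0xff is a signed int), which is formally an overflow. A sketch of an overflow-free equivalent:

/* Sketch: same little-endian store/load, done in unsigned arithmetic. */
#include <stdint.h>

static void imm32(uint8_t **p, int32_t imm)
{
    uint32_t u = (uint32_t) imm;
    *(*p)++ = (uint8_t) ( u        & 0xff);
    *(*p)++ = (uint8_t) ((u >>  8) & 0xff);
    *(*p)++ = (uint8_t) ((u >> 16) & 0xff);
    *(*p)++ = (uint8_t) ((u >> 24) & 0xff);
}

static int32_t read_imm32(const uint8_t *p)
{
    uint32_t u = (uint32_t) p[0]
               | ((uint32_t) p[1] <<  8)
               | ((uint32_t) p[2] << 16)
               | ((uint32_t) p[3] << 24);
    return (int32_t) u;
}
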
-void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
-// if(imm < 65536) *(*p)++ = 0x66;
- if(dst >= 8) *(*p)++ = 0x41;
-
- //if(imm < 65536 && imm >= 256) *(*p)++ = 0x66;
-
- //if(imm >= 256)
- *(*p)++ = 0xb8 | (dst & 0x7);
-// else *(*p)++ = 0xb0 | (dst & 0x7);
-
- // if(imm < 256) IMM8(p, imm);
-// else
-//if(imm < 65536) IMM16(p, imm);
-//else
- IMM32(p, imm);
-
-//if(dst < 8) {
-// *(*p)++ = 0xb8 + dst;
-//}else{
-// *(*p)++ = 0x49;
-// *(*p)++ = 0xc7;
-// *(*p)++ = 0xc0 | (dst - 8);
-//}
-//IMM32(p, imm);
+void MOVI(uint8_t **p, uint8_t dst, uint32_t imm)
+{
+ if (dst >= 8) {
+ *(*p)++ = 0x41;
+ }
+
+ *(*p)++ = 0xb8 | (dst & 0x7);
+ IMM32(p, imm);
}
-void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
- if(disp == 0) {
- *(*p)++ = (rm & 7) | ((reg & 7) << 3);
- }else if(disp <= 127 || disp >= -128) {
- *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
- IMM8(p, disp);
- }else{
- *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
- IMM32(p, disp);
- }
+void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp)
+{
+ if (disp == 0) {
+ *(*p)++ = (rm & 7) | ((reg & 7) << 3);
+	} else if (disp <= 127 && disp >= -128) {
+ *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
+ IMM8(p, disp);
+ } else {
+ *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
+ IMM32(p, disp);
+ }
}
-void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
+void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp)
+{
+ *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1);
+ *(*p)++ = 0x8d;
+ ADDRMODE(p, dst, base, disp);
+}
- *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1);
- *(*p)++ = 0x8d;
- ADDRMODE(p, dst, base, disp);
+void RET(uint8_t **p)
+{
+ *(*p)++ = 0xc3;
}
-void RET(uint8_t **p) {
- *(*p)++ = 0xc3;
+void ADDI(uint8_t **p, uint8_t dst, int32_t imm)
+{
+ if (dst >= 8) {
+ *(*p)++ = 0x49;
+ } else {
+ *(*p)++ = 0x48;
+ }
+
+ if (imm > 127 || imm <= -128) {
+ *(*p)++ = 0x81;
+ } else {
+ *(*p)++ = 0x83;
+ }
+
+ *(*p)++ = 0xc0 | (dst & 0x7);
+
+ if (imm > 127 || imm <= -128) {
+ IMM32(p, imm);
+ } else {
+ IMM8(p, imm);
+ }
}
-void ADDI(uint8_t **p, uint8_t dst, int32_t imm) {
-
- if(dst >= 8) *(*p)++ = 0x49;
- else *(*p)++ = 0x48;
-
- if(imm > 127 || imm <= -128) *(*p)++ = 0x81;
- else *(*p)++ = 0x83;
-
- *(*p)++ = 0xc0 | (dst & 0x7);
-
- if(imm > 127 || imm <= -128) IMM32(p, imm);
- else IMM8(p, imm);
+void CALL(uint8_t **p, uint8_t *func)
+{
+ *(*p)++ = 0xe8;
+ IMM32(p, func - *p - 4);
}
-void CALL(uint8_t **p, uint8_t *func) {
- *(*p)++ = 0xe8;
- IMM32(p, ((void *)func) - (void *)(*p) - 4);
+void PUSH(uint8_t **p, uint8_t reg)
+{
+ if (reg >= 8) {
+ *(*p)++ = 0x41;
+ }
+
+ *(*p)++ = 0x50 | (reg & 7);
}
-void PUSH(uint8_t **p, uint8_t reg) {
- if(reg >= 8) *(*p)++ = 0x41;
- *(*p)++ = 0x50 | (reg & 7);
+void POP(uint8_t **p, uint8_t reg)
+{
+ if (reg >= 8) {
+ *(*p)++ = 0x41;
+ }
+
+ *(*p)++ = 0x58 | (reg & 7);
}
-void POP(uint8_t **p, uint8_t reg) {
- if(reg >= 8) *(*p)++ = 0x41;
- *(*p)++ = 0x58 | (reg & 7);
+
+void SHIFT(uint8_t **p, uint8_t reg, int shift)
+{
+ if (reg >= 8) {
+ *(*p)++ = 0x49;
+ }
+
+ *(*p)++ = 0xc1;
+ if (shift > 0) {
+ *(*p)++ = 0xe0 | (reg & 7);
+ *(*p)++ = (shift & 0xff);
+ } else {
+ *(*p)++ = 0xe8 | (reg & 7);
+ *(*p)++ = ((-shift) & 0xff);
+ }
}
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+
+#endif /* FFTS_CODEGEN_SSE_H */
\ No newline at end of file
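
Review note: the new SHIFT helper replaces the raw 0xC1 bytes codegen.c used to emit inline. C1 /4 ib is SHL r/m32, imm8 and C1 /5 ib is SHR r/m32, imm8; the 0x49 prefix (REX.W|REX.B) is emitted for r8..r15, which also widens the shift to 64 bits, but the call sites only pass EBX/ECX so that path is unused. A standalone sketch that prints the bytes emitted for the two loop-counter registers used above:

/* Sketch: replica of SHIFT, printing its output for inspection. */
#include <stdint.h>
#include <stdio.h>

static void shift(uint8_t **p, uint8_t reg, int amount)
{
    if (reg >= 8)
        *(*p)++ = 0x49;             /* REX.W|REX.B for r8..r15 */

    *(*p)++ = 0xc1;                 /* shift r/m by imm8 */
    if (amount > 0) {
        *(*p)++ = 0xe0 | (reg & 7); /* /4 = SHL */
        *(*p)++ = (uint8_t) amount;
    } else {
        *(*p)++ = 0xe8 | (reg & 7); /* /5 = SHR */
        *(*p)++ = (uint8_t) -amount;
    }
}

int main(void)
{
    uint8_t buf[8], *p = buf, *q;

    shift(&p, 1, 2);   /* ECX: shl ecx, 2 -> C1 E1 02 */
    shift(&p, 3, -1);  /* EBX: shr ebx, 1 -> C1 EB 01 */

    for (q = buf; q < p; q++)
        printf("%02X ", *q);
    printf("\n");
    return 0;
}
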
diff --git a/src/ffts.c b/src/ffts.c
index b413c2b..9ceb97f 100644
--- a/src/ffts.c
+++ b/src/ffts.c
@@ -1,418 +1,618 @@
/*
-
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+
#include "ffts.h"
#include "macros.h"
-//#include "mini_macros.h"
#include "patterns.h"
#include "ffts_small.h"
#ifdef DYNAMIC_DISABLED
- #include "ffts_static.h"
+#include "ffts_static.h"
#else
- #include "codegen.h"
+#include "codegen.h"
#endif
-#include <errno.h>
- #include <sys/mman.h>
- #include <string.h>
- #include <limits.h> /* for PAGESIZE */
-
#if __APPLE__
- #include <libkern/OSCacheControl.h>
+#include <libkern/OSCacheControl.h>
+#endif
+
+#if _WIN32
+#include <windows.h>
#else
+#include <sys/mman.h>
+#endif
+
+#ifdef __arm__
+static const FFTS_ALIGN(64) float w_data[16] = {
+ 0.70710678118654757273731092936941f,
+ 0.70710678118654746171500846685376f,
+ -0.70710678118654757273731092936941f,
+ -0.70710678118654746171500846685376f,
+ 1.0f,
+ 0.70710678118654757273731092936941f,
+ -0.0f,
+ -0.70710678118654746171500846685376f,
+ 0.70710678118654757273731092936941f,
+ 0.70710678118654746171500846685376f,
+ 0.70710678118654757273731092936941f,
+ 0.70710678118654746171500846685376f,
+ 1.0f,
+ 0.70710678118654757273731092936941f,
+ 0.0f,
+ 0.70710678118654746171500846685376f
+};
#endif
-void ffts_execute(ffts_plan_t *p, const void * in, void * out) {
+static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
+{
+ int result;
-//TODO: Define NEEDS_ALIGNED properly instead
-#if defined(HAVE_SSE) || defined(HAVE_NEON)
- if(((int)in % 16) != 0) {
- LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
- }
+#ifdef _WIN32
+ DWORD old_protect;
+ result = !VirtualProtect(start, len, PAGE_EXECUTE_READ, &old_protect);
+#else
+ result = mprotect(start, len, PROT_READ | PROT_EXEC);
+#endif
+
+ return result;
+}
+
+static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
+{
+ int result;
+
+#ifdef _WIN32
+ DWORD old_protect;
+	result = !VirtualProtect(start, len, PAGE_READWRITE, &old_protect);
+#else
+ result = mprotect(start, len, PROT_READ | PROT_WRITE);
+#endif
+
+ return result;
+}
+
+static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
+{
+#ifdef __APPLE__
+ sys_icache_invalidate(start, length);
+#elif __ANDROID__
+ cacheflush((long) start, (long) start + length, 0);
+#elif __linux__
+#if GCC_VERSION_AT_LEAST(4,3)
+ __builtin___clear_cache(start, (char*) start + length);
+#elif __GNUC__
+ __clear_cache((long) start, (long) start + length);
+#endif
+#elif _WIN32
+ return !FlushInstructionCache(GetCurrentProcess(), start, length);
+#endif
+ return 0;
+}
- if(((int)out % 16) != 0) {
- LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
- }
+static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
+{
+#if __APPLE__
+ return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
+#elif _WIN32
+ return VirtualAlloc(NULL, length, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+#else
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS 0x20
#endif
- p->transform(p, (const float *)in, (float *)out);
+ return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+#endif
}
-void ffts_free(ffts_plan_t *p) {
- p->destroy(p);
+static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
+{
+#ifdef _WIN32
+ (void) length;
+ VirtualFree(addr, 0, MEM_RELEASE);
+#else
+ munmap(addr, length);
+#endif
}
-void ffts_free_1d(ffts_plan_t *p) {
-
- size_t i;
+void ffts_execute(ffts_plan_t *p, const void *in, void *out)
+{
+ /* TODO: Define NEEDS_ALIGNED properly instead */
+#if defined(HAVE_SSE) || defined(HAVE_NEON)
+ if (((int) in % 16) != 0) {
+ LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
+ }
+
+ if (((int) out % 16) != 0) {
+ LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
+ }
+#endif
+
+ p->transform(p, (const float*) in, (float*) out);
+}
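
Review note: per the TODO above, the alignment test casts the pointer to int, which truncates on LP64 targets (the low four bits survive, so the test still works, but it rides on a narrowing conversion and draws warnings). A one-line sketch of a warning-free check; the macro name is hypothetical:

/* Sketch: pointer alignment test without narrowing. */
#include <stdint.h>

#define FFTS_IS_ALIGNED16(ptr) ((((uintptr_t)(const void*)(ptr)) & 0xF) == 0)
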
- if(p->ws) {
- FFTS_FREE(p->ws);
- }
- if(p->is) free(p->is);
- if(p->ws_is) free(p->ws_is);
- if(p->offsets) free(p->offsets);
- //free(p->transforms);
- if(p->transforms) free(p->transforms);
+void ffts_free(ffts_plan_t *p)
+{
+ if (p) {
+ p->destroy(p);
+ }
+}
+void ffts_free_1d(ffts_plan_t *p)
+{
#if !defined(DYNAMIC_DISABLED)
- if(p->transform_base) {
- if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
- perror("Couldn't mprotect");
- exit(errno);
- }
- munmap(p->transform_base, p->transform_size);
- //free(p->transform_base);
- }
+ if (p->transform_base) {
+ ffts_deny_execute(p->transform_base, p->transform_size);
+ ffts_vmem_free(p->transform_base, p->transform_size);
+ }
#endif
- free(p);
+
+ if (p->ws_is) {
+ free(p->ws_is);
+ }
+
+ if (p->ws) {
+ FFTS_FREE(p->ws);
+ }
+
+ if (p->transforms) {
+ free(p->transforms);
+ }
+
+ if (p->is) {
+ free(p->is);
+ }
+
+ if (p->offsets) {
+ free(p->offsets);
+ }
+
+ free(p);
}
-ffts_plan_t *ffts_init_1d(size_t N, int sign) {
- if(N == 0 || (N & (N - 1)) != 0){
- LOG("FFT size must be a power of two\n");
- return NULL;
- }
+static int ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+ V MULI_SIGN;
+ int hardcoded;
+ size_t lut_size;
+ size_t n_luts;
+ float *tmp;
+ cdata_t *w;
+ size_t i;
+ int n;
+
+#ifdef __arm__
+	/* #ifdef HAVE_NEON */
+
+ if(sign < 0) {
+ MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+ } else {
+ MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+ }
+
+ /* #endif */
+#else
+ if (sign < 0) {
+ MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+ } else {
+ MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+ }
+#endif
+
+ /* LUTS */
+ n_luts = ffts_ctzl(N / leaf_N);
+ if (N < 32) {
+ n_luts = ffts_ctzl(N / 4);
+ hardcoded = 1;
+ } else {
+ hardcoded = 0;
+ }
+
+ if (n_luts >= 32) {
+ n_luts = 0;
+ }
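+
+	/* one LUT is needed per recursion level above the leaf transforms:
+	   log2(N / leaf_N) levels, or log2(N / 4) for the hardcoded sizes */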
- ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
- size_t leafN = 8;
- size_t i;
+ /* fprintf(stderr, "n_luts = %zu\n", n_luts); */
+ n = leaf_N * 2;
+ if (hardcoded) {
+ n = 8;
+ }
+
+ lut_size = 0;
+
+ for (i = 0; i < n_luts; i++) {
+ if (!i || hardcoded) {
+#ifdef __arm__
+ if (N <= 32) {
+ lut_size += n/4 * 2 * sizeof(cdata_t);
+ } else {
+ lut_size += n/4 * sizeof(cdata_t);
+ }
+#else
+ lut_size += n/4 * 2 * sizeof(cdata_t);
+#endif
+ n *= 2;
+ } else {
#ifdef __arm__
-//#ifdef HAVE_NEON
- V MULI_SIGN;
-
- if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
- else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
-//#endif
+ lut_size += n/8 * 3 * sizeof(cdata_t);
#else
- V MULI_SIGN;
-
- if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
- else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+ lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
+#endif
+ }
+ n *= 2;
+ }
+
+ /* lut_size *= 16; */
+
+ /* fprintf(stderr, "lut size = %zu\n", lut_size); */
+
+ if (n_luts) {
+ p->ws = FFTS_MALLOC(lut_size, 32);
+ if (!p->ws) {
+ goto cleanup;
+ }
+
+ p->ws_is = malloc(n_luts * sizeof(*p->ws_is));
+ if (!p->ws_is) {
+ goto cleanup;
+ }
+ }
+
+ w = p->ws;
+
+ n = leaf_N * 2;
+ if (hardcoded) {
+ n = 8;
+ }
+
+#ifdef HAVE_NEON
+ V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
#endif
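+
+	/* forward/inverse selection is done by XOR-ing -0.0f (the IEEE-754 sign
+	   bit) into the imaginary lanes of the twiddle factors */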
- p->transform = NULL;
- p->transform_base = NULL;
- p->transforms = NULL;
- p->is = NULL;
- p->ws_is = NULL;
- p->ws = NULL;
- p->offsets = NULL;
- p->destroy = ffts_free_1d;
-
- if(N >= 32) {
- ffts_init_offsets(p, N, leafN);
+ for (i = 0; i < n_luts; i++) {
+ p->ws_is[i] = w - (cdata_t *)p->ws;
+		/* fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); */
+
+		if (!i || hardcoded) {
+ cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
+ float *fw0 = (float*) w0;
+ float *fw = (float *)w;
+
+ size_t j;
+ for (j = 0; j < n/4; j++) {
+				w0[j][0] = W_re((float) n, (float) j);
+				w0[j][1] = W_im((float) n, (float) j);
+ }
+
#ifdef __arm__
+		if (N < 32) {
+			float *fw = (float *)w;
+			V temp0, re, im;
+
+			for (j = 0; j < n/4; j += 2) {
+				temp0 = VLD(fw0 + j*2);
+				re = VDUPRE(temp0);
+				im = VDUPIM(temp0);
+#ifdef HAVE_NEON
+				im = VXOR(im, MULI_SIGN);
+#else
+				im = MULI(sign > 0, im);
+#endif
+				VST(fw + j*4, re);
+				VST(fw + j*4 + 4, im);
+			}
+
+			w += n/4 * 2;
+		} else {
+			float *fw = (float *)w;
#ifdef HAVE_NEON
- ffts_init_is(p, N, leafN, 1);
+			VS temp0;
+
+			for (j = 0; j < n/4; j += 4) {
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2, temp0);
+ }
#else
- ffts_init_is(p, N, leafN, 1);
+			for (j = 0; j < n/4; j++) {
+ fw[j*2] = fw0[j*2];
+ fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+ }
#endif
+ w += n/4;
+ }
#else
- ffts_init_is(p, N, leafN, 1);
+		for (j = 0; j < n/4; j += 2) {
+ V re, im, temp0;
+ temp0 = VLD(fw0 + j*2);
+ re = VDUPRE(temp0);
+ im = VDUPIM(temp0);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*4 + 0, re);
+ VST(fw + j*4 + 4, im);
+ }
+
+ w += n/4 * 2;
#endif
-
- p->i0 = N/leafN/3+1;
- p->i1 = N/leafN/3;
- if((N/leafN) % 3 > 1) p->i1++;
- p->i2 = N/leafN/3;
-
- #ifdef __arm__
- #ifdef HAVE_NEON
- p->i0/=2;
- p->i1/=2;
- #endif
- #else
- p->i0/=2;
- p->i1/=2;
- #endif
-
- }else{
- p->transforms = malloc(2 * sizeof(transform_index_t));
- p->transforms[0] = 0;
- p->transforms[1] = 1;
- if(N == 2) p->transform = &firstpass_2;
- else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
- else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
- else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
- else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
- else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
- else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
-
- p->is = NULL;
- p->offsets = NULL;
- }
-
- int hardcoded = 0;
-
- /* LUTS */
- size_t n_luts = __builtin_ctzl(N/leafN);
- if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
-
- if(n_luts >= 32) n_luts = 0;
-
-// fprintf(stderr, "n_luts = %zu\n", n_luts);
-
- cdata_t *w;
-
- int n = leafN*2;
- if(hardcoded) n = 8;
-
- size_t lut_size = 0;
-
- for(i=0;i<n_luts;i++) {
- if(!i || hardcoded) {
- #ifdef __arm__
- if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
- else lut_size += n/4 * sizeof(cdata_t);
- #else
- lut_size += n/4 * 2 * sizeof(cdata_t);
- #endif
- n *= 2;
- } else {
- #ifdef __arm__
- lut_size += n/8 * 3 * sizeof(cdata_t);
- #else
- lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
- #endif
- }
- n *= 2;
- }
-
-// lut_size *= 16;
-
- // fprintf(stderr, "lut size = %zu\n", lut_size);
- if(n_luts) {
- p->ws = FFTS_MALLOC(lut_size,32);
- p->ws_is = malloc(n_luts * sizeof(size_t));
- }else{
- p->ws = NULL;
- p->ws_is = NULL;
- }
- w = p->ws;
-
- n = leafN*2;
- if(hardcoded) n = 8;
-
- #ifdef HAVE_NEON
- V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
- #endif
-
- for(i=0;i<n_luts;i++) {
- p->ws_is[i] = w - (cdata_t *)p->ws;
- //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);
-
- if(!i || hardcoded) {
- cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
-
- size_t j;
- for(j=0;j<n/4;j++) {
- w0[j][0] = W_re(n,j);
- w0[j][1] = W_im(n,j);
- }
-
-
- float *fw0 = (float *)w0;
- #ifdef __arm__
- if(N < 32) {
- //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
- float *fw = (float *)w;
- V temp0, temp1, temp2;
- for(j=0;j<n/4;j+=2) {
- // #ifdef HAVE_NEON
- temp0 = VLD(fw0 + j*2);
- V re, im;
- re = VDUPRE(temp0);
- im = VDUPIM(temp0);
- #ifdef HAVE_NEON
- im = VXOR(im, MULI_SIGN);
- //im = IMULI(sign>0, im);
- #else
- im = MULI(sign>0, im);
- #endif
- VST(fw + j*4 , re);
- VST(fw + j*4+4, im);
- // #endif
- }
- w += n/4 * 2;
- }else{
- //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
- float *fw = (float *)w;
- #ifdef HAVE_NEON
- VS temp0, temp1, temp2;
- for(j=0;j<n/4;j+=4) {
- temp0 = VLD2(fw0 + j*2);
- temp0.val[1] = VXOR(temp0.val[1], neg);
- STORESPR(fw + j*2, temp0);
- }
- #else
- for(j=0;j<n/4;j+=1) {
- fw[j*2] = fw0[j*2];
- fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
- }
- #endif
- w += n/4;
- }
- #else
- //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
- float *fw = (float *)w;
- V temp0, temp1, temp2;
- for(j=0;j<n/4;j+=2) {
- temp0 = VLD(fw0 + j*2);
- V re, im;
- re = VDUPRE(temp0);
- im = VDUPIM(temp0);
- im = VXOR(im, MULI_SIGN);
- VST(fw + j*4 , re);
- VST(fw + j*4+4, im);
- }
- w += n/4 * 2;
- #endif
-
- FFTS_FREE(w0);
- }else{
-
- cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
- cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
- cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
-
- size_t j;
- for(j=0;j<n/8;j++) {
- w0[j][0] = W_re(n,j*2);
- w0[j][1] = W_im(n,j*2);
- w1[j][0] = W_re(n,j);
- w1[j][1] = W_im(n,j);
- w2[j][0] = W_re(n,j + (n/8));
- w2[j][1] = W_im(n,j + (n/8));
-
- }
-
- float *fw0 = (float *)w0;
- float *fw1 = (float *)w1;
- float *fw2 = (float *)w2;
- #ifdef __arm__
- //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
- float *fw = (float *)w;
- #ifdef HAVE_NEON
- VS temp0, temp1, temp2;
- for(j=0;j<n/8;j+=4) {
- temp0 = VLD2(fw0 + j*2);
- temp0.val[1] = VXOR(temp0.val[1], neg);
- STORESPR(fw + j*2*3, temp0);
- temp1 = VLD2(fw1 + j*2);
- temp1.val[1] = VXOR(temp1.val[1], neg);
- STORESPR(fw + j*2*3 + 8, temp1);
- temp2 = VLD2(fw2 + j*2);
- temp2.val[1] = VXOR(temp2.val[1], neg);
- STORESPR(fw + j*2*3 + 16, temp2);
- }
- #else
- for(j=0;j<n/8;j+=1) {
- fw[j*6] = fw0[j*2];
- fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
- fw[j*6+2] = fw1[j*2+0];
- fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
- fw[j*6+4] = fw2[j*2+0];
- fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
- }
- #endif
- w += n/8 * 3;
- #else
- //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
- float *fw = (float *)w;
- V temp0, temp1, temp2, re, im;
- for(j=0;j<n/8;j+=2) {
- temp0 = VLD(fw0 + j*2);
- re = VDUPRE(temp0);
- im = VDUPIM(temp0);
- im = VXOR(im, MULI_SIGN);
- VST(fw + j*2*6 , re);
- VST(fw + j*2*6+4, im);
-
- temp1 = VLD(fw1 + j*2);
- re = VDUPRE(temp1);
- im = VDUPIM(temp1);
- im = VXOR(im, MULI_SIGN);
- VST(fw + j*2*6+8 , re);
- VST(fw + j*2*6+12, im);
-
- temp2 = VLD(fw2 + j*2);
- re = VDUPRE(temp2);
- im = VDUPIM(temp2);
- im = VXOR(im, MULI_SIGN);
- VST(fw + j*2*6+16, re);
- VST(fw + j*2*6+20, im);
- }
- w += n/8 * 3 * 2;
- #endif
-
- FFTS_FREE(w0);
- FFTS_FREE(w1);
- FFTS_FREE(w2);
- }
- ///p->ws[i] = w;
-
- n *= 2;
- }
-
- float *tmp = (float *)p->ws;
-
- if(sign < 0) {
- p->oe_ws = (void *)(&w_data[4]);
- p->ee_ws = (void *)(w_data);
- p->eo_ws = (void *)(&w_data[4]);
- }else{
- p->oe_ws = (void *)(w_data + 12);
- p->ee_ws = (void *)(w_data + 8);
- p->eo_ws = (void *)(w_data + 12);
- }
-
- p->N = N;
- p->lastlut = w;
- p->n_luts = n_luts;
-#ifdef DYNAMIC_DISABLED
- if(sign < 0) {
- if(N >= 32) p->transform = ffts_static_transform_f;
- }else{
- if(N >= 32) p->transform = ffts_static_transform_i;
- }
+ FFTS_FREE(w0);
+ } else {
+ cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
+
+ float *fw0 = (float*) w0;
+ float *fw1 = (float*) w1;
+ float *fw2 = (float*) w2;
+
+#ifndef __arm__
+			float *fw = (float *)w;
+			V temp0, temp1, temp2, re, im;
+#endif
+
+ size_t j;
+ for (j = 0; j < n/8; j++) {
+				w0[j][0] = W_re((float) n, (float) (2*j));
+				w0[j][1] = W_im((float) n, (float) (2*j));
+				w1[j][0] = W_re((float) n, (float) j);
+				w1[j][1] = W_im((float) n, (float) j);
+				w2[j][0] = W_re((float) n, (float) (j + (n/8)));
+				w2[j][1] = W_im((float) n, (float) (j + (n/8)));
+ }
+
+#ifdef __arm__
+			float *fw = (float *)w;
+#ifdef HAVE_NEON
+ VS temp0, temp1, temp2;
+ for (j = 0; j < n/8; j += 4) {
+ temp0 = VLD2(fw0 + j*2);
+ temp0.val[1] = VXOR(temp0.val[1], neg);
+ STORESPR(fw + j*2*3, temp0);
+ temp1 = VLD2(fw1 + j*2);
+ temp1.val[1] = VXOR(temp1.val[1], neg);
+ STORESPR(fw + j*2*3 + 8, temp1);
+ temp2 = VLD2(fw2 + j*2);
+ temp2.val[1] = VXOR(temp2.val[1], neg);
+ STORESPR(fw + j*2*3 + 16, temp2);
+ }
#else
- if(N>=32) ffts_generate_func_code(p, N, leafN, sign);
+			for (j = 0; j < n/8; j++) {
+ fw[j*6] = fw0[j*2];
+ fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+ fw[j*6+2] = fw1[j*2+0];
+ fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
+ fw[j*6+4] = fw2[j*2+0];
+ fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
+ }
#endif
+ w += n/8 * 3;
+#else
+ for (j = 0; j < n/8; j += 2) {
+ temp0 = VLD(fw0 + j*2);
+ re = VDUPRE(temp0);
+ im = VDUPIM(temp0);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6 , re);
+ VST(fw + j*2*6+4, im);
+
+ temp1 = VLD(fw1 + j*2);
+ re = VDUPRE(temp1);
+ im = VDUPIM(temp1);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6+8 , re);
+ VST(fw + j*2*6+12, im);
+
+ temp2 = VLD(fw2 + j*2);
+ re = VDUPRE(temp2);
+ im = VDUPIM(temp2);
+ im = VXOR(im, MULI_SIGN);
+ VST(fw + j*2*6+16, re);
+ VST(fw + j*2*6+20, im);
+ }
+
+ w += n/8 * 3 * 2;
+#endif
+
+ FFTS_FREE(w0);
+ FFTS_FREE(w1);
+ FFTS_FREE(w2);
+ }
+
+ n *= 2;
+ }
+
+#ifdef __arm__
+ if (sign < 0) {
+ p->oe_ws = (void*)(&w_data[4]);
+ p->ee_ws = (void*)(w_data);
+ p->eo_ws = (void*)(&w_data[4]);
+ } else {
+ p->oe_ws = (void*)(w_data + 12);
+ p->ee_ws = (void*)(w_data + 8);
+ p->eo_ws = (void*)(w_data + 12);
+ }
+#endif
+
+ p->lastlut = w;
+ p->n_luts = n_luts;
+ return 0;
- return p;
+cleanup:
+ return -1;
}
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+ffts_plan_t *ffts_init_1d(size_t N, int sign)
+{
+ const size_t leaf_N = 8;
+ ffts_plan_t *p;
+
+ if (N < 2 || (N & (N - 1)) != 0) {
+		LOG("FFT size must be a power of two, and at least 2\n");
+ return NULL;
+ }
+
+ p = calloc(1, sizeof(*p));
+ if (!p) {
+ return NULL;
+ }
+
+ p->destroy = ffts_free_1d;
+ p->N = N;
+
+ if (ffts_generate_luts(p, N, leaf_N, sign)) {
+ goto cleanup;
+ }
+
+ if (N >= 32) {
+ p->offsets = ffts_init_offsets(N, leaf_N);
+ if (!p->offsets) {
+ goto cleanup;
+ }
+
+ p->is = ffts_init_is(N, leaf_N, 1);
+ if (!p->is) {
+ goto cleanup;
+ }
+
+ p->i0 = N/leaf_N/3 + 1;
+ p->i1 = p->i2 = N/leaf_N/3;
+ if ((N/leaf_N) % 3 > 1) {
+ p->i1++;
+ }
+
+#ifdef __arm__
+#ifdef HAVE_NEON
+ p->i0 /= 2;
+ p->i1 /= 2;
+#endif
+#else
+ p->i0 /= 2;
+ p->i1 /= 2;
+#endif
+
+#ifdef DYNAMIC_DISABLED
+ if (sign < 0) {
+ p->transform = ffts_static_transform_f;
+ } else {
+ p->transform = ffts_static_transform_i;
+ }
+#else
+	/* determine the transform size */
+#ifdef __arm__
+ if (N < 8192) {
+ p->transform_size = 8192;
+ } else {
+ p->transform_size = N;
+ }
+#else
+ if (N < 2048) {
+ p->transform_size = 16384;
+ } else {
+ p->transform_size = 16384 + 2*N/8 * ffts_ctzl(N);
+ }
+#endif
+
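+	/* JIT pipeline: reserve writable pages, emit the transform code into
+	   them, then make the block executable and flush the instruction cache */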
+ /* allocate code/function buffer */
+ p->transform_base = ffts_vmem_alloc(p->transform_size);
+ if (!p->transform_base) {
+ goto cleanup;
+ }
+
+ /* generate code */
+ p->transform = ffts_generate_func_code(p, N, leaf_N, sign);
+ if (!p->transform) {
+ goto cleanup;
+ }
+
+ /* enable execution with read access for the block */
+ if (ffts_allow_execute(p->transform_base, p->transform_size)) {
+ goto cleanup;
+ }
+
+ /* flush from the instruction cache */
+ if (ffts_flush_instruction_cache(p->transform_base, p->transform_size)) {
+ goto cleanup;
+ }
+#endif
+ } else {
+ p->transforms = malloc(2 * sizeof(*p->transforms));
+ if (!p->transforms) {
+ goto cleanup;
+ }
+
+ p->transforms[0] = 0;
+ p->transforms[1] = 1;
+
+ switch (N) {
+ case 2:
+ p->transform = &ffts_firstpass_2;
+ break;
+ case 4:
+ if (sign == -1) {
+ p->transform = &ffts_firstpass_4_f;
+ } else if (sign == 1) {
+ p->transform = &ffts_firstpass_4_b;
+ }
+ break;
+ case 8:
+ if (sign == -1) {
+ p->transform = &ffts_firstpass_8_f;
+ } else if (sign == 1) {
+ p->transform = &ffts_firstpass_8_b;
+ }
+ break;
+ case 16:
+ default:
+ if (sign == -1) {
+ p->transform = &ffts_firstpass_16_f;
+ } else {
+ p->transform = &ffts_firstpass_16_b;
+ }
+ break;
+ }
+ }
+
+ return p;
+
+cleanup:
+ ffts_free_1d(p);
+ return NULL;
+} \ No newline at end of file
diff --git a/src/ffts.h b/src/ffts.h
index cd9e24b..ca2951a 100644
--- a/src/ffts.h
+++ b/src/ffts.h
@@ -30,21 +30,21 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+
#ifndef __CP_SSE_H__
#define __CP_SSE_H__
-#include "config.h"
+//#include "config.h"
+//#include "codegen.h"
+#include "ffts_attributes.h"
+#include "types.h"
-#include <stdio.h>
-#include <stdlib.h>
+/* malloc.h is not available on every platform (e.g. Mac OS X) */
+#if defined(_MSC_VER) || defined(__linux__)
+#include <malloc.h>
+#endif
#include <math.h>
#include <stddef.h>
#include <stdint.h>
-//#include <stdalign.h>
-
-//#include "codegen.h"
-#include "types.h"
-
+#include <stdlib.h>
+#include <stdio.h>
#ifdef __ANDROID__
#include <android/log.h>
@@ -53,29 +53,14 @@
#define LOG(s) fprintf(stderr, s)
#endif
-#define PI 3.1415926535897932384626433832795028841971693993751058209
-
-static const __attribute__ ((aligned(64))) float w_data[16] = {
- 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
- -0.70710678118654757273731092936941, -0.70710678118654746171500846685376,
- 1.0f, 0.70710678118654757273731092936941f,
- -0.0f, -0.70710678118654746171500846685376,
- 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
- 0.70710678118654757273731092936941, 0.70710678118654746171500846685376,
- 1.0f, 0.70710678118654757273731092936941f,
- 0.0f, 0.70710678118654746171500846685376
-};
-
-__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); }
-__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); }
-
-typedef size_t transform_index_t;
-
-//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
-typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795028841971693993751058209
+#endif
typedef struct _ffts_plan_t ffts_plan_t;
+typedef void (*transform_func_t)(ffts_plan_t *p, const void *in, void *out);
+
/**
* Contains all the Information need to perform FFT
*
@@ -86,101 +71,153 @@ typedef struct _ffts_plan_t ffts_plan_t;
*/
struct _ffts_plan_t {
- /**
- *
- */
- ptrdiff_t *offsets;
+ /**
+ *
+ */
+ ptrdiff_t *offsets;
#ifdef DYNAMIC_DISABLED
- /**
- * Twiddle factors
- */
- void *ws;
- /**
- * ee - 2 size x size8
- * oo - 2 x size4 in parallel
- * oe -
- */
- void *oe_ws, *eo_ws, *ee_ws;
+ /**
+ * Twiddle factors
+ */
+ void *ws;
+
+ /**
+ * ee - 2 size x size8
+ * oo - 2 x size4 in parallel
+ * oe -
+ */
+ void *oe_ws, *eo_ws, *ee_ws;
#else
- void __attribute__((aligned(32))) *ws;
- void __attribute__((aligned(32))) *oe_ws, *eo_ws, *ee_ws;
+ void FFTS_ALIGN(32) *ws;
+ void FFTS_ALIGN(32) *oe_ws, *eo_ws, *ee_ws;
#endif
- /**
- * Pointer into an array of precomputed indexes for the input data array
- */
- ptrdiff_t *is;
-
- /**
- * Twiddle Factor Indexes
- */
- size_t *ws_is;
-
- /**
- * Size of the loops for the base cases
- */
- size_t i0, i1, n_luts;
-
- /**
- * Size fo the Transform
- */
- size_t N;
- void *lastlut;
- /**
- * Used in multidimensional Code ??
- */
- transform_index_t *transforms;
- //transform_func_t transform;
-
- /**
- * Pointer to the dynamically generated function
- * that will execute the FFT
- */
- void (*transform)(ffts_plan_t * , const void * , void * );
-
- /**
- * Pointer to the base memory address of
- * of the transform function
- */
- void *transform_base;
-
- /**
- * Size of the memory block contain the
- * generated code
- */
- size_t transform_size;
-
- /**
- * Points to the cosnant variables used by
- * the Assembly Code
- */
- void *constants;
-
- // multi-dimensional stuff:
- struct _ffts_plan_t **plans;
- int rank;
- size_t *Ns, *Ms;
- void *buf;
-
- void *transpose_buf;
-
- /**
- * Pointer to the destroy function
- * to clean up the plan after use
- * (differs for real and multi dimension transforms
- */
- void (*destroy)(ffts_plan_t *);
-
- /**
- * Coefficiants for the real valued transforms
- */
- float *A, *B;
-
- size_t i2;
+
+ /**
+ * Pointer into an array of precomputed indexes for the input data array
+ */
+ ptrdiff_t *is;
+
+ /**
+ * Twiddle Factor Indexes
+ */
+ size_t *ws_is;
+
+ /**
+ * Size of the loops for the base cases
+ */
+ size_t i0, i1, n_luts;
+
+ /**
+ * Size fo the Transform
+	 * Size of the transform
+ size_t N;
+ void *lastlut;
+
+ /**
+	 * Used by the multidimensional code
+ */
+ size_t *transforms;
+
+ /**
+ * Pointer to the dynamically generated function
+ * that will execute the FFT
+ */
+ transform_func_t transform;
+
+ /**
+ * Pointer to the base memory address of
+	 * Pointer to the base memory address
+	 * of the transform function
+ void *transform_base;
+
+ /**
+ * Size of the memory block contain the
+	 * Size of the memory block containing the
+ */
+ size_t transform_size;
+
+ /**
+	 * Points to the constant variables used by
+	 * the assembly code
+ */
+ void *constants;
+
+ // multi-dimensional stuff:
+ struct _ffts_plan_t **plans;
+ int rank;
+ size_t *Ns, *Ms;
+ void *buf;
+
+ void *transpose_buf;
+
+ /**
+ * Pointer to the destroy function
+ * to clean up the plan after use
+	 * (differs for real and multidimensional transforms)
+ */
+ void (*destroy)(ffts_plan_t *);
+
+ /**
+	 * Coefficients for the real-valued transforms
+ */
+ float *A, *B;
+
+ size_t i2;
};
+static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
+{
+#if defined(_MSC_VER)
+ return _aligned_malloc(size, 32);
+#else
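+	/* valloc returns page-aligned memory, which satisfies the 32-byte
+	   alignment used by the twiddle buffers */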
+ return valloc(size);
+#endif
+}
-void ffts_free(ffts_plan_t *);
+static FFTS_INLINE void ffts_aligned_free(void *p)
+{
+#if defined(_MSC_VER)
+ _aligned_free(p);
+#else
+ free(p);
+#endif
+}
+
+#if GCC_VERSION_AT_LEAST(3,3)
+#define ffts_ctzl __builtin_ctzl
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#ifdef _M_AMD64
+#pragma intrinsic(_BitScanForward64)
+static __inline unsigned long ffts_ctzl(size_t N)
+{
+ unsigned long count;
+ _BitScanForward64((unsigned long*) &count, N);
+ return count;
+}
+#else
+#pragma intrinsic(_BitScanForward)
+static __inline unsigned long ffts_ctzl(size_t N)
+{
+ unsigned long count;
+ _BitScanForward((unsigned long*) &count, N);
+ return count;
+}
+#endif /* _M_AMD64 */
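+#else
+/* portable fallback for other compilers: a minimal loop-based
+   count-trailing-zeros sketch; like the intrinsics above, the result
+   for a zero argument is not meaningful */
+static FFTS_INLINE unsigned long ffts_ctzl(size_t N)
+{
+	unsigned long count = 0;
+
+	while (N && !(N & 1)) {
+		N >>= 1;
+		count++;
+	}
+
+	return count;
+}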
+#endif /* _MSC_VER */
+
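+/* W_re/W_im: real and imaginary parts of the twiddle factor
+   W_N^k = exp(-2*pi*i*k / N) */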
+static FFTS_ALWAYS_INLINE float W_re(float N, float k)
+{
+ return cos(-2.0 * M_PI * k / N);
+}
+
+static FFTS_ALWAYS_INLINE float W_im(float N, float k)
+{
+ return sin(-2.0 * M_PI * k / N);
+}
+
+void ffts_free(ffts_plan_t *);
+void ffts_execute(ffts_plan_t *, const void *, void *);
ffts_plan_t *ffts_init_1d(size_t N, int sign);
-void ffts_execute(ffts_plan_t *, const void *, void *);
+
#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
diff --git a/src/macros-sse.h b/src/macros-sse.h
index d845734..85cd02d 100644
--- a/src/macros-sse.h
+++ b/src/macros-sse.h
@@ -1,10 +1,10 @@
/*
-
+
This file is part of FFTS -- The Fastest Fourier Transform in the South
-
+
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato
-
+ Copyright (c) 2012, The University of Waikato
+
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -31,8 +31,8 @@
*/
-#ifndef __SSE_FLOAT_H__
-#define __SSE_FLOAT_H__
+#ifndef FFTS_MACROS_SSE_H
+#define FFTS_MACROS_SSE_H
#include <xmmintrin.h>
@@ -63,23 +63,27 @@ typedef __m128 V;
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
#define FFTS_FREE(d) (_mm_free(d))
-__INLINE V IMULI(int inv, V a) {
- if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
- else return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+static FFTS_ALWAYS_INLINE V IMULI(int inv, V a)
+{
+ if (inv) {
+ return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+ } else {
+ return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+ }
}
-
-__INLINE V IMUL(V d, V re, V im) {
- re = VMUL(re, d);
- im = VMUL(im, VSWAPPAIRS(d));
- return VSUB(re, im);
+static FFTS_ALWAYS_INLINE V IMUL(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VSUB(re, im);
}
-__INLINE V IMULJ(V d, V re, V im) {
- re = VMUL(re, d);
- im = VMUL(im, VSWAPPAIRS(d));
- return VADD(re, im);
+static FFTS_ALWAYS_INLINE V IMULJ(V d, V re, V im)
+{
+ re = VMUL(re, d);
+ im = VMUL(im, VSWAPPAIRS(d));
+ return VADD(re, im);
}
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+#endif /* FFTS_MACROS_SSE_H */
diff --git a/src/macros.h b/src/macros.h
index 08029a3..12c52c6 100644
--- a/src/macros.h
+++ b/src/macros.h
@@ -1,38 +1,42 @@
/*
-
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
-
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the organization nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __MACROS_H__
-#define __MACROS_H__
+#ifndef FFTS_MACROS_H
+#define FFTS_MACROS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
#ifdef HAVE_NEON
#include "macros-neon.h"
@@ -44,119 +48,138 @@
#include "macros-altivec.h"
#endif
#endif
-
#endif
-
#ifdef HAVE_VFP
#include "macros-alpha.h"
#endif
+
#ifdef HAVE_SSE
- #include "macros-sse.h"
+#include "macros-sse.h"
#endif
-static inline void TX2(V *a, V *b)
+static FFTS_INLINE void TX2(V *a, V *b)
{
V TX2_t0 = VUNPACKLO(*a, *b);
V TX2_t1 = VUNPACKHI(*a, *b);
- *a = TX2_t0; *b = TX2_t1;
+ *a = TX2_t0;
+ *b = TX2_t1;
}
-static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
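+/* conjugate-pair butterfly: *r2 is multiplied by the twiddle (re, im) and
+   *r3 by its conjugate; their sum and +/-i-rotated difference are then
+   combined with *r0 and *r1 */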
+static FFTS_INLINE void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
{
V uk, uk2, zk_p, zk_n, zk, zk_d;
- uk = *r0; uk2 = *r1;
+
+ uk = *r0;
+ uk2 = *r1;
+
zk_p = IMUL(*r2, re, im);
zk_n = IMULJ(*r3, re, im);
- zk = VADD(zk_p, zk_n);
+ zk = VADD(zk_p, zk_n);
zk_d = IMULI(inv, VSUB(zk_p, zk_n));
-
+
*r2 = VSUB(uk, zk);
*r0 = VADD(uk, zk);
*r3 = VADD(uk2, zk_d);
*r1 = VSUB(uk2, zk_d);
}
-
-static inline void S_4(V r0, V r1, V r2, V r3,
- data_t * restrict o0, data_t * restrict o1,
- data_t * restrict o2, data_t * restrict o3)
-{
- VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
-}
-
-
-static inline void L_2_4(int inv,
- const data_t * restrict i0, const data_t * restrict i1,
- const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void L_2_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1,
+ const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3,
+ V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+ t0 = VLD(i0);
+ t1 = VLD(i1);
+ t2 = VLD(i2);
+ t3 = VLD(i3);
+
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
+
*r0 = VUNPACKLO(t4, t5);
*r1 = VUNPACKLO(t6, t7);
+
t5 = IMULI(inv, t5);
+
t0 = VADD(t6, t4);
t2 = VSUB(t6, t4);
t1 = VSUB(t7, t5);
t3 = VADD(t7, t5);
+
*r3 = VUNPACKHI(t0, t1);
*r2 = VUNPACKHI(t2, t3);
}
-
-static inline void L_4_4(int inv,
- const data_t * restrict i0, const data_t * restrict i1,
- const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3)
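+/* size-4 leaf pass: loads four vectors, applies a radix-4 butterfly (t7
+   carries the +/-i rotation) and transposes the results into r0..r3 */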
+static FFTS_INLINE void L_4_4(int inv, const data_t* FFTS_RESTRICT i0, const data_t* FFTS_RESTRICT i1,
+ const data_t* FFTS_RESTRICT i2, const data_t* FFTS_RESTRICT i3,
+ V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
-
- t0 = VLD(i0); t1 = VLD(i1); t2 = VLD(i2); t3 = VLD(i3);
+
+ t0 = VLD(i0);
+ t1 = VLD(i1);
+ t2 = VLD(i2);
+ t3 = VLD(i3);
+
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
+
t7 = IMULI(inv, VSUB(t2, t3));
+
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
+
TX2(&t0, &t1);
TX2(&t2, &t3);
- *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
-}
-
+ *r0 = t0;
+ *r2 = t1;
+ *r1 = t2;
+ *r3 = t3;
+}
-static inline void L_4_2(int inv,
- const data_t * restrict i0, const data_t * restrict i1,
- const data_t * restrict i2, const data_t * restrict i3,
- V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void L_4_2(int inv, const data_t * FFTS_RESTRICT i0, const data_t * FFTS_RESTRICT i1,
+ const data_t * FFTS_RESTRICT i2, const data_t * FFTS_RESTRICT i3,
+ V *r0, V *r1, V *r2, V *r3)
{
V t0, t1, t2, t3, t4, t5, t6, t7;
- t0 = VLD(i0); t1 = VLD(i1); t6 = VLD(i2); t7 = VLD(i3);
+ t0 = VLD(i0);
+ t1 = VLD(i1);
+ t6 = VLD(i2);
+ t7 = VLD(i3);
+
t2 = VBLEND(t6, t7);
t3 = VBLEND(t7, t6);
+
t4 = VADD(t0, t1);
t5 = VSUB(t0, t1);
t6 = VADD(t2, t3);
t7 = VSUB(t2, t3);
+
*r2 = VUNPACKHI(t4, t5);
- *r3 = VUNPACKHI(t6, t7);
+ *r3 = VUNPACKHI(t6, t7);
+
t7 = IMULI(inv, t7);
+
t0 = VADD(t4, t6);
t2 = VSUB(t4, t6);
t1 = VSUB(t5, t7);
t3 = VADD(t5, t7);
+
*r0 = VUNPACKLO(t0, t1);
*r1 = VUNPACKLO(t2, t3);
}
-#endif
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
+
+#define S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+	do { VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3); } while (0)
+
+#endif /* FFTS_MACROS_H */ \ No newline at end of file
diff --git a/src/sse.s b/src/sse.s
index 90f02db..ccdebc8 100644
--- a/src/sse.s
+++ b/src/sse.s
@@ -54,6 +54,7 @@ _leaf_ee_init:
leaf_ee_init:
#endif
#lea L_sse_constants(%rip), %r9
+ movq (%rdi), %r8
movq 0xe0(%rdi), %r9
xorl %eax, %eax
@@ -559,9 +560,9 @@ x8_soft:
leaq (%r14,%rcx,4), %r15
X8_soft_loop:
movaps (%rsi), %xmm9
- movaps (%r10,%rax,4), %xmm6
+ movaps (%r10,%rax,4), %xmm6
movaps %xmm9, %xmm11
- movaps (%r11,%rax,4), %xmm7
+ movaps (%r11,%rax,4), %xmm7
movaps 16(%rsi), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
@@ -647,6 +648,14 @@ X8_soft_loop:
ret
#ifdef __APPLE__
+ .globl _x8_soft_end
+_x8_soft_end:
+#else
+ .globl x8_soft_end
+x8_soft_end:
+#endif
+
+#ifdef __APPLE__
.globl _x8_hard
_x8_hard:
#else
diff --git a/src/sse_win64.s b/src/sse_win64.s
new file mode 100644
index 0000000..6b75391
--- /dev/null
+++ b/src/sse_win64.s
@@ -0,0 +1,840 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+ .code64
+
+ .globl _neon_x4
+ .align 4
+_neon_x4:
+
+ .globl _neon_x8
+ .align 4
+_neon_x8:
+
+ .globl _neon_x8_t
+ .align 4
+_neon_x8_t:
+
+#ifdef __APPLE__
+ .globl _leaf_ee_init
+_leaf_ee_init:
+#else
+ .globl leaf_ee_init
+leaf_ee_init:
+#endif
+
+# rcx is a pointer to the ffts_plan
+# eax is loop counter (init to 0)
+# rbx is loop max count
+# rdx is 'in' base pointer
+# r8 is 'out' base pointer
+# rdi is offsets pointer
+# rsi is constants pointer
+# scratch: rax r10 r11
+
+ xorl %eax, %eax
+ movq (%rcx), %rdi
+ movq 0xe0(%rcx), %rsi
+
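+# The 0xFECA displacements below are placeholders: the sse_leaf_*_offsets
+# tables at the end of this file tell the code generator where to patch the
+# real input offsets when the leaf code is copied into a plan.
+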
+# _leaf_ee + 8 needs 16 byte alignment
+#ifdef __APPLE__
+ .globl _leaf_ee
+_leaf_ee:
+#else
+ .globl leaf_ee
+leaf_ee:
+#endif
+ movaps 32(%rsi), %xmm0 #83.5
+ movaps (%rsi), %xmm8 #83.5
+LEAF_EE_1:
+LEAF_EE_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7 #83.5
+LEAF_EE_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12 #83.5
+ movaps %xmm7, %xmm6 #83.5
+LEAF_EE_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5
+ movaps %xmm12, %xmm11 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm10, %xmm11 #83.5
+ xorps %xmm8, %xmm12 #83.5
+LEAF_EE_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm9 #83.5
+LEAF_EE_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm10 #83.5
+ addps %xmm9, %xmm6 #83.5
+ subps %xmm9, %xmm7 #83.5
+LEAF_EE_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm13 #83.5
+ movaps %xmm10, %xmm9 #83.5
+LEAF_EE_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm3 #83.5
+ movaps %xmm6, %xmm5 #83.5
+LEAF_EE_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm14 #83.5
+ movaps %xmm3, %xmm15 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm7, %xmm4 #83.5
+ movslq (%rdi, %rax, 4), %r10 #83.44
+ subps %xmm13, %xmm10 #83.5
+ subps %xmm14, %xmm3 #83.5
+ addps %xmm11, %xmm5 #83.5
+ subps %xmm11, %xmm6 #83.5
+ subps %xmm12, %xmm4 #83.5
+ addps %xmm12, %xmm7 #83.5
+ addps %xmm13, %xmm9 #83.5
+ addps %xmm14, %xmm15 #83.5
+ movaps 16(%rsi), %xmm12 #83.5
+ movaps %xmm9, %xmm1 #83.5
+ movaps 16(%rsi), %xmm11 #83.5
+ movaps %xmm5, %xmm2 #83.5
+ mulps %xmm10, %xmm12 #83.5
+ subps %xmm15, %xmm9 #83.5
+ addps %xmm15, %xmm1 #83.5
+ mulps %xmm3, %xmm11 #83.5
+ addps %xmm1, %xmm2 #83.5
+ subps %xmm1, %xmm5 #83.5
+ shufps $177, %xmm10, %xmm10 #83.5
+ xorps %xmm8, %xmm9 #83.5
+ shufps $177, %xmm3, %xmm3 #83.5
+ movaps %xmm6, %xmm1 #83.5
+ mulps %xmm0, %xmm10 #83.5
+ movaps %xmm4, %xmm13 #83.5
+ mulps %xmm0, %xmm3 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm3, %xmm11 #83.5
+ movaps %xmm12, %xmm3 #83.5
+ movaps %xmm7, %xmm14 #83.5
+ shufps $177, %xmm9, %xmm9 #83.5
+ subps %xmm11, %xmm12 #83.5
+ addps %xmm11, %xmm3 #83.5
+ subps %xmm9, %xmm1 #83.5
+ addps %xmm9, %xmm6 #83.5
+ addps %xmm3, %xmm4 #83.5
+ subps %xmm3, %xmm13 #83.5
+ xorps %xmm8, %xmm12 #83.5
+ movaps %xmm2, %xmm3 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm6, %xmm9 #83.5
+ movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movlhps %xmm4, %xmm3 #83.5
+ addq $4, %rax
+ shufps $238, %xmm4, %xmm2 #83.5
+ movaps %xmm1, %xmm4 #83.5
+ subps %xmm12, %xmm7 #83.5
+ addps %xmm12, %xmm14 #83.5
+ movlhps %xmm7, %xmm4 #83.5
+ shufps $238, %xmm7, %xmm1 #83.5
+ movaps %xmm5, %xmm7 #83.5
+ movlhps %xmm13, %xmm7 #83.5
+ movlhps %xmm14, %xmm9 #83.5
+ shufps $238, %xmm13, %xmm5 #83.5
+ shufps $238, %xmm14, %xmm6 #83.5
+ movaps %xmm3, (%r8,%r10,4) #83.5
+ movaps %xmm4, 16(%r8,%r10,4) #83.5
+ movaps %xmm7, 32(%r8,%r10,4) #83.5
+ movaps %xmm9, 48(%r8,%r10,4) #83.5
+ movaps %xmm2, (%r8,%r11,4) #83.5
+ movaps %xmm1, 16(%r8,%r11,4) #83.5
+ movaps %xmm5, 32(%r8,%r11,4) #83.5
+ movaps %xmm6, 48(%r8,%r11,4) #83.5
+ cmpq %rbx, %rax
+ jne LEAF_EE_1
+
+# _leaf_oo + 3 needs to be 16 byte aligned
+#ifdef __APPLE__
+ .globl _leaf_oo
+_leaf_oo:
+#else
+ .globl leaf_oo
+leaf_oo:
+#endif
+ movaps (%rsi), %xmm5 #92.7
+LEAF_OO_1:
+LEAF_OO_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm4 #93.5
+ movaps %xmm4, %xmm6 #93.5
+LEAF_OO_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7 #93.5
+LEAF_OO_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm10 #93.5
+ addps %xmm7, %xmm6 #93.5
+ subps %xmm7, %xmm4 #93.5
+LEAF_OO_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm8 #93.5
+ movaps %xmm10, %xmm9 #93.5
+LEAF_OO_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm1 #93.5
+ movaps %xmm6, %xmm3 #93.5
+LEAF_OO_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm11 #93.5
+ movaps %xmm1, %xmm2 #93.5
+LEAF_OO_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm14 #93.5
+ movaps %xmm4, %xmm15 #93.5
+LEAF_OO_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12 #93.5
+ movaps %xmm14, %xmm13 #93.5
+ movslq (%rdi, %rax, 4), %r10 #83.44
+ subps %xmm8, %xmm10 #93.5
+ addps %xmm8, %xmm9 #93.5
+ addps %xmm11, %xmm2 #93.5
+ subps %xmm12, %xmm14 #93.5
+ subps %xmm11, %xmm1 #93.5
+ addps %xmm12, %xmm13 #93.5
+ addps %xmm9, %xmm3 #93.5
+ subps %xmm9, %xmm6 #93.5
+ xorps %xmm5, %xmm10 #93.5
+ xorps %xmm5, %xmm14 #93.5
+ shufps $177, %xmm10, %xmm10 #93.5
+ movaps %xmm2, %xmm9 #93.5
+ shufps $177, %xmm14, %xmm14 #93.5
+ movaps %xmm6, %xmm7 #93.5
+ movslq 8(%rdi, %rax, 4), %r11 #83.59
+ addq $4, %rax #92.18
+ addps %xmm10, %xmm4 #93.5
+ addps %xmm13, %xmm9 #93.5
+ subps %xmm13, %xmm2 #93.5
+ subps %xmm10, %xmm15 #93.5
+ movaps %xmm1, %xmm13 #93.5
+ movaps %xmm2, %xmm8 #93.5
+ movlhps %xmm4, %xmm7 #93.5
+ subps %xmm14, %xmm13 #93.5
+ addps %xmm14, %xmm1 #93.5
+ shufps $238, %xmm4, %xmm6 #93.5
+ movaps %xmm3, %xmm14 #93.5
+ movaps %xmm9, %xmm4 #93.5
+ movlhps %xmm15, %xmm14 #93.5
+ movlhps %xmm13, %xmm4 #93.5
+ movlhps %xmm1, %xmm8 #93.5
+ shufps $238, %xmm15, %xmm3 #93.5
+ shufps $238, %xmm13, %xmm9 #93.5
+ shufps $238, %xmm1, %xmm2 #93.5
+ movaps %xmm14, (%r8,%r10,4) #93.5
+ movaps %xmm7, 16(%r8,%r10,4) #93.5
+ movaps %xmm4, 32(%r8,%r10,4) #93.5
+ movaps %xmm8, 48(%r8,%r10,4) #93.5
+ movaps %xmm3, (%r8,%r11,4) #93.5
+ movaps %xmm6, 16(%r8,%r11,4) #93.5
+ movaps %xmm9, 32(%r8,%r11,4) #93.5
+ movaps %xmm2, 48(%r8,%r11,4) #93.5
+ cmpq %rbx, %rax
+ jne LEAF_OO_1 # Prob 95% #92.14
+
+#ifdef __APPLE__
+ .globl _leaf_eo
+_leaf_eo:
+#else
+ .globl leaf_eo
+leaf_eo:
+#endif
+LEAF_EO_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm9 #88.5
+LEAF_EO_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7 #88.5
+ movaps %xmm9, %xmm11 #88.5
+LEAF_EO_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm5 #88.5
+ movaps %xmm7, %xmm6 #88.5
+LEAF_EO_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
+ subps %xmm5, %xmm7 #88.5
+ addps %xmm4, %xmm11 #88.5
+ subps %xmm4, %xmm9 #88.5
+ addps %xmm5, %xmm6 #88.5
+ movaps (%rsi), %xmm3 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ xorps %xmm3, %xmm7 #88.5
+ movaps %xmm9, %xmm8 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ addps %xmm6, %xmm10 #88.5
+ subps %xmm6, %xmm11 #88.5
+ subps %xmm7, %xmm8 #88.5
+ addps %xmm7, %xmm9 #88.5
+ movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movaps %xmm10, %xmm2 #88.5
+ movslq (%rdi, %rax, 4), %r10 #83.44
+ movaps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm8, %xmm10 #88.5
+ shufps $238, %xmm9, %xmm11 #88.5
+ movaps %xmm10, (%r8,%r11,4) #88.5
+ movaps %xmm11, 16(%r8,%r11,4) #88.5
+LEAF_EO_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm15 #88.5
+LEAF_EO_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12 #88.5
+ movaps %xmm15, %xmm14 #88.5
+LEAF_EO_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm4 #88.5
+ addps %xmm12, %xmm14 #88.5
+ subps %xmm12, %xmm15 #88.5
+LEAF_EO_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm13 #88.5
+ movaps %xmm4, %xmm5 #88.5
+ movaps %xmm14, %xmm7 #88.5
+ addps %xmm13, %xmm5 #88.5
+ subps %xmm13, %xmm4 #88.5
+ movlhps %xmm8, %xmm2 #88.5
+ movaps %xmm5, %xmm8 #88.5
+ movlhps %xmm15, %xmm7 #88.5
+ xorps %xmm3, %xmm15 #88.5
+ movaps %xmm5, %xmm6 #88.5
+ subps %xmm14, %xmm5 #88.5
+ addps %xmm14, %xmm6 #88.5
+ movlhps %xmm9, %xmm1 #88.5
+ movaps %xmm4, %xmm14 #88.5
+ movlhps %xmm4, %xmm8 #88.5
+ movaps %xmm1, %xmm12 #88.5
+ shufps $177, %xmm15, %xmm15 #88.5
+ movaps 0x30(%rsi), %xmm11 #88.5
+ addq $4, %rax #90.5
+ subps %xmm15, %xmm14 #88.5
+ mulps %xmm7, %xmm11 #88.5
+ addps %xmm15, %xmm4 #88.5
+ movaps 0x30(%rsi), %xmm9 #88.5
+ movaps 0x40(%rsi), %xmm15 #88.5
+ shufps $177, %xmm7, %xmm7 #88.5
+ mulps %xmm8, %xmm9 #88.5
+ mulps %xmm15, %xmm7 #88.5
+ shufps $177, %xmm8, %xmm8 #88.5
+ subps %xmm7, %xmm11 #88.5
+ mulps %xmm15, %xmm8 #88.5
+ movaps %xmm11, %xmm10 #88.5
+ addps %xmm8, %xmm9 #88.5
+ shufps $238, %xmm14, %xmm6 #88.5
+ subps %xmm9, %xmm11 #88.5
+ addps %xmm9, %xmm10 #88.5
+ xorps %xmm3, %xmm11 #88.5
+ movaps %xmm2, %xmm3 #88.5
+ shufps $177, %xmm11, %xmm11 #88.5
+ subps %xmm10, %xmm3 #88.5
+ addps %xmm10, %xmm2 #88.5
+ addps %xmm11, %xmm12 #88.5
+ subps %xmm11, %xmm1 #88.5
+ shufps $238, %xmm4, %xmm5 #88.5
+ movaps %xmm5, 48(%r8,%r11,4) #88.5
+ movaps %xmm6, 32(%r8,%r11,4) #88.5
+ movaps %xmm2, (%r8,%r10,4) #88.5
+ movaps %xmm1, 16(%r8,%r10,4) #88.5
+ movaps %xmm3, 32(%r8,%r10,4) #88.5
+ movaps %xmm12, 48(%r8,%r10,4) #88.5
+
+#ifdef __APPLE__
+ .globl _leaf_oe
+_leaf_oe:
+#else
+ .globl leaf_oe
+leaf_oe:
+#endif
+ movaps (%rsi), %xmm0 #59.5
+LEAF_OE_const_2:
+ movaps 0xFECA(%rdx,%rax,4), %xmm6 #70.5
+LEAF_OE_const_3:
+ movaps 0xFECA(%rdx,%rax,4), %xmm8 #70.5
+ movaps %xmm6, %xmm10 #70.5
+ shufps $228, %xmm8, %xmm10 #70.5
+ movaps %xmm10, %xmm9 #70.5
+ shufps $228, %xmm6, %xmm8 #70.5
+LEAF_OE_const_0:
+ movaps 0xFECA(%rdx,%rax,4), %xmm12 #70.5
+LEAF_OE_const_1:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
+ movaps %xmm12, %xmm14 #70.5
+ movslq (%rdi, %rax, 4), %r10 #83.44
+ addps %xmm8, %xmm9 #70.5
+ subps %xmm8, %xmm10 #70.5
+ addps %xmm7, %xmm14 #70.5
+ subps %xmm7, %xmm12 #70.5
+ movaps %xmm9, %xmm4 #70.5
+ movaps %xmm14, %xmm13 #70.5
+ shufps $238, %xmm10, %xmm4 #70.5
+ xorps %xmm0, %xmm10 #70.5
+ shufps $177, %xmm10, %xmm10 #70.5
+ movaps %xmm12, %xmm11 #70.5
+ movaps %xmm14, %xmm5 #70.5
+ addps %xmm9, %xmm13 #70.5
+ subps %xmm10, %xmm11 #70.5
+ subps %xmm9, %xmm14 #70.5
+ shufps $238, %xmm12, %xmm5 #70.5
+ addps %xmm10, %xmm12 #70.5
+ movslq 8(%rdi, %rax, 4), %r11 #83.59
+ movlhps %xmm11, %xmm13 #70.5
+ movaps %xmm13, (%r8,%r10,4) #70.5
+ movaps 0x30(%rsi), %xmm13 #70.5
+ movlhps %xmm12, %xmm14 #70.5
+ movaps 0x40(%rsi), %xmm12 #70.5
+ mulps %xmm5, %xmm13 #70.5
+ shufps $177, %xmm5, %xmm5 #70.5
+ mulps %xmm12, %xmm5 #70.5
+ movaps %xmm14, 16(%r8,%r10,4) #70.5
+ subps %xmm5, %xmm13 #70.5
+ movaps 0x30(%rsi), %xmm5 #70.5
+ mulps %xmm4, %xmm5 #70.5
+ shufps $177, %xmm4, %xmm4 #70.5
+ mulps %xmm12, %xmm4 #70.5
+LEAF_OE_const_4:
+ movaps 0xFECA(%rdx,%rax,4), %xmm9 #70.5
+ addps %xmm4, %xmm5 #70.5
+LEAF_OE_const_6:
+ movaps 0xFECA(%rdx,%rax,4), %xmm7 #70.5
+ movaps %xmm9, %xmm3 #70.5
+LEAF_OE_const_7:
+ movaps 0xFECA(%rdx,%rax,4), %xmm2 #70.5
+ movaps %xmm7, %xmm6 #70.5
+LEAF_OE_const_5:
+ movaps 0xFECA(%rdx,%rax,4), %xmm15 #70.5
+ movaps %xmm13, %xmm4 #70.5
+ subps %xmm2, %xmm7 #70.5
+ addps %xmm15, %xmm3 #70.5
+ subps %xmm15, %xmm9 #70.5
+ addps %xmm2, %xmm6 #70.5
+ subps %xmm5, %xmm13 #70.5
+ addps %xmm5, %xmm4 #70.5
+ xorps %xmm0, %xmm7 #70.5
+ addq $4, %rax #72.5
+ movaps %xmm3, %xmm2 #70.5
+ shufps $177, %xmm7, %xmm7 #70.5
+ movaps %xmm9, %xmm8 #70.5
+ xorps %xmm0, %xmm13 #70.5
+ addps %xmm6, %xmm2 #70.5
+ subps %xmm7, %xmm8 #70.5
+ subps %xmm6, %xmm3 #70.5
+ addps %xmm7, %xmm9 #70.5
+ movaps %xmm2, %xmm10 #70.5
+ movaps %xmm3, %xmm11 #70.5
+ shufps $238, %xmm8, %xmm2 #70.5
+ shufps $238, %xmm9, %xmm3 #70.5
+ movaps %xmm2, %xmm14 #70.5
+ shufps $177, %xmm13, %xmm13 #70.5
+ subps %xmm4, %xmm14 #70.5
+ addps %xmm4, %xmm2 #70.5
+ movaps %xmm3, %xmm4 #70.5
+ subps %xmm13, %xmm3 #70.5
+ addps %xmm13, %xmm4 #70.5
+ movlhps %xmm8, %xmm10 #70.5
+ movlhps %xmm9, %xmm11 #70.5
+ movaps %xmm10, 32(%r8,%r10,4) #70.5
+ movaps %xmm11, 48(%r8,%r10,4) #70.5
+ movaps %xmm2, (%r8,%r11,4) #70.5
+ movaps %xmm3, 16(%r8,%r11,4) #70.5
+ movaps %xmm14, 32(%r8,%r11,4) #70.5
+ movaps %xmm4, 48(%r8,%r11,4) #70.5
+
+#ifdef __APPLE__
+ .globl _leaf_end
+_leaf_end:
+#else
+ .globl leaf_end
+leaf_end:
+#endif
+
+#ifdef __APPLE__
+ .globl _x_init
+_x_init:
+#else
+ .globl x_init
+x_init:
+#endif
+ movaps (%rsi), %xmm3 #34.3
+ movq 0x20(%rcx), %rdi
+#ifdef __APPLE__
+ .globl _x4
+_x4:
+#else
+ .globl x4
+x4:
+#endif
+ movaps 64(%r8), %xmm0 #34.3
+ movaps 96(%r8), %xmm1 #34.3
+ movaps (%r8), %xmm7 #34.3
+ movaps (%rdi), %xmm4 #const
+ movaps %xmm7, %xmm9 #34.3
+ movaps %xmm4, %xmm6 #34.3
+ movaps 16(%rdi), %xmm2 #const
+ mulps %xmm0, %xmm6 #34.3
+ mulps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ shufps $177, %xmm1, %xmm1 #34.3
+ mulps %xmm2, %xmm0 #34.3
+ mulps %xmm1, %xmm2 #34.3
+ subps %xmm0, %xmm6 #34.3
+ addps %xmm2, %xmm4 #34.3
+ movaps %xmm6, %xmm5 #34.3
+ subps %xmm4, %xmm6 #34.3
+ addps %xmm4, %xmm5 #34.3
+ movaps 32(%r8), %xmm8 #34.3
+ xorps %xmm3, %xmm6 #34.3
+ shufps $177, %xmm6, %xmm6 #34.3
+ movaps %xmm8, %xmm10 #34.3
+ movaps 112(%r8), %xmm12 #34.3
+ subps %xmm5, %xmm9 #34.3
+ addps %xmm5, %xmm7 #34.3
+ addps %xmm6, %xmm10 #34.3
+ subps %xmm6, %xmm8 #34.3
+ movaps %xmm7, (%r8) #34.3
+ movaps %xmm8, 32(%r8) #34.3
+ movaps %xmm9, 64(%r8) #34.3
+ movaps %xmm10, 96(%r8) #34.3
+ movaps 32(%rdi), %xmm14 #const #34.3
+ movaps 80(%r8), %xmm11 #34.3
+ movaps %xmm14, %xmm0 #34.3
+ movaps 48(%rdi), %xmm13 #const #34.3
+ mulps %xmm11, %xmm0 #34.3
+ mulps %xmm12, %xmm14 #34.3
+ shufps $177, %xmm11, %xmm11 #34.3
+ shufps $177, %xmm12, %xmm12 #34.3
+ mulps %xmm13, %xmm11 #34.3
+ mulps %xmm12, %xmm13 #34.3
+ subps %xmm11, %xmm0 #34.3
+ addps %xmm13, %xmm14 #34.3
+ movaps %xmm0, %xmm15 #34.3
+ subps %xmm14, %xmm0 #34.3
+ addps %xmm14, %xmm15 #34.3
+ xorps %xmm3, %xmm0 #34.3
+ movaps 16(%r8), %xmm1 #34.3
+ movaps 48(%r8), %xmm2 #34.3
+ movaps %xmm1, %xmm4 #34.3
+ shufps $177, %xmm0, %xmm0 #34.3
+ movaps %xmm2, %xmm5 #34.3
+ addps %xmm15, %xmm1 #34.3
+ subps %xmm0, %xmm2 #34.3
+ subps %xmm15, %xmm4 #34.3
+ addps %xmm0, %xmm5 #34.3
+ movaps %xmm1, 16(%r8) #34.3
+ movaps %xmm2, 48(%r8) #34.3
+ movaps %xmm4, 80(%r8) #34.3
+ movaps %xmm5, 112(%r8) #34.3
+ ret
+
+# _x8_soft + 6 needs to be 16 byte aligned
+#ifdef __APPLE__
+ .globl _x8_soft
+_x8_soft:
+#else
+ .globl x8_soft
+x8_soft:
+#endif
+	# inputs: rdi (input), r8 (output), rbx (output stride)
+	# clobbered: rax, rcx, rdx, rsi, r10, r11 (r9 not used)
+
+ # input
+ movq %rdi, %rax
+
+ # output
+ movq %r8, %rcx
+
+ # loop stop (output + output_stride)
+ leaq (%r8, %rbx), %rdx
+
+ # 3 * output_stride
+ leaq (%rbx, %rbx, 2), %rsi
+
+ # 5 * output_stride
+ leaq (%rbx, %rbx, 4), %r10
+
+ # 7 * output_stride
+ leaq (%rsi, %rbx, 4), %r11
+
+X8_soft_loop:
+ # input + 0 * input_stride
+ movaps (%rax), %xmm9
+
+ # output + 2 * output_stride
+ movaps (%rcx, %rbx, 2), %xmm6
+
+ movaps %xmm9, %xmm11
+
+ # output + 3 * output_stride
+ movaps (%rcx, %rsi), %xmm7
+
+ # input + 1 * input_stride
+ movaps 16(%rax), %xmm8
+
+ mulps %xmm6, %xmm11
+ mulps %xmm7, %xmm9
+ shufps $177, %xmm6, %xmm6
+ mulps %xmm8, %xmm6
+ shufps $177, %xmm7, %xmm7
+ subps %xmm6, %xmm11
+ mulps %xmm7, %xmm8
+ movaps %xmm11, %xmm10
+ addps %xmm8, %xmm9
+
+ # input + 2 * input_stride
+ movaps 32(%rax), %xmm15
+
+ addps %xmm9, %xmm10
+ subps %xmm9, %xmm11
+
+ # output + 0 * output_stride
+ movaps (%rcx), %xmm5
+
+ movaps %xmm15, %xmm6
+
+ # output + 4 * output_stride
+ movaps (%rcx, %rbx, 4), %xmm12
+
+ movaps %xmm5, %xmm2
+
+ # output + 6 * output_stride
+ movaps (%rcx, %rsi, 2), %xmm13
+
+ xorps %xmm3, %xmm11 #const
+
+ # input + 3 * input_stride
+ movaps 48(%rax), %xmm14
+
+ subps %xmm10, %xmm2
+ mulps %xmm12, %xmm6
+ addps %xmm10, %xmm5
+ mulps %xmm13, %xmm15
+
+ # input + 4 * input_stride
+ movaps 64(%rax), %xmm10
+
+ movaps %xmm5, %xmm0
+ shufps $177, %xmm12, %xmm12
+ shufps $177, %xmm13, %xmm13
+ mulps %xmm14, %xmm12
+ mulps %xmm13, %xmm14
+ subps %xmm12, %xmm6
+ addps %xmm14, %xmm15
+
+ # output + 5 * output_stride
+ movaps (%rcx, %r10), %xmm7
+
+ movaps %xmm10, %xmm13
+
+ # output + 7 * output_stride
+ movaps (%rcx, %r11), %xmm8
+
+ movaps %xmm6, %xmm12
+
+ # input + 5 * input_stride
+ movaps 80(%rax), %xmm9
+
+ # input + 6 * input_stride
+ addq $96, %rax
+
+ mulps %xmm7, %xmm13
+ subps %xmm15, %xmm6
+ addps %xmm15, %xmm12
+ mulps %xmm8, %xmm10
+ subps %xmm12, %xmm0
+ addps %xmm12, %xmm5
+ shufps $177, %xmm7, %xmm7
+ xorps %xmm3, %xmm6 #const
+ shufps $177, %xmm8, %xmm8
+ movaps %xmm2, %xmm12
+ mulps %xmm9, %xmm7
+ mulps %xmm8, %xmm9
+ subps %xmm7, %xmm13
+ addps %xmm9, %xmm10
+
+ # output + 1 * output_stride
+ movaps (%rcx, %rbx), %xmm4
+
+ shufps $177, %xmm11, %xmm11
+ movaps %xmm4, %xmm1
+ shufps $177, %xmm6, %xmm6
+ addps %xmm11, %xmm1
+ subps %xmm11, %xmm4
+ addps %xmm6, %xmm12
+ subps %xmm6, %xmm2
+ movaps %xmm13, %xmm11
+ movaps %xmm4, %xmm14
+ movaps %xmm1, %xmm6
+ subps %xmm10, %xmm13
+ addps %xmm10, %xmm11
+ xorps %xmm3, %xmm13 #const
+ addps %xmm11, %xmm4
+ subps %xmm11, %xmm14
+ shufps $177, %xmm13, %xmm13
+
+ # output + 0 * output_stride
+ movaps %xmm5, (%rcx)
+
+ # output + 1 * output_stride
+ movaps %xmm4, (%rcx, %rbx)
+
+ # output + 2 * output_stride
+ movaps %xmm2, (%rcx, %rbx, 2)
+
+ subps %xmm13, %xmm1
+ addps %xmm13, %xmm6
+
+ # output + 3 * output_stride
+ movaps %xmm1, (%rcx, %rsi)
+
+ # output + 4 * output_stride
+ movaps %xmm0, (%rcx, %rbx, 4)
+
+ # output + 5 * output_stride
+ movaps %xmm14, (%rcx, %r10)
+
+ # output + 6 * output_stride
+ movaps %xmm12, (%rcx, %rsi, 2)
+
+ # output + 7 * output_stride
+ movaps %xmm6, (%rcx, %r11)
+
+ # output + 8 * output_stride
+ addq $16, %rcx
+
+ cmpq %rdx, %rcx
+ jne X8_soft_loop
+ ret
+
+#ifdef __APPLE__
+ .globl _x8_soft_end
+_x8_soft_end:
+#else
+ .globl x8_soft_end
+x8_soft_end:
+#endif
+
+#ifdef __APPLE__
+ .globl _sse_leaf_ee_offsets
+ .globl _sse_leaf_oo_offsets
+ .globl _sse_leaf_eo_offsets
+ .globl _sse_leaf_oe_offsets
+ .align 4
+_sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-_leaf_ee+0x4
+ .long LEAF_EE_const_1-_leaf_ee+0x5
+ .long LEAF_EE_const_2-_leaf_ee+0x5
+ .long LEAF_EE_const_3-_leaf_ee+0x5
+ .long LEAF_EE_const_4-_leaf_ee+0x5
+ .long LEAF_EE_const_5-_leaf_ee+0x5
+ .long LEAF_EE_const_6-_leaf_ee+0x4
+ .long LEAF_EE_const_7-_leaf_ee+0x5
+_sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-_leaf_oo+0x4
+ .long LEAF_OO_const_1-_leaf_oo+0x4
+ .long LEAF_OO_const_2-_leaf_oo+0x5
+ .long LEAF_OO_const_3-_leaf_oo+0x5
+ .long LEAF_OO_const_4-_leaf_oo+0x4
+ .long LEAF_OO_const_5-_leaf_oo+0x5
+ .long LEAF_OO_const_6-_leaf_oo+0x5
+ .long LEAF_OO_const_7-_leaf_oo+0x5
+_sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-_leaf_eo+0x5
+ .long LEAF_EO_const_1-_leaf_eo+0x4
+ .long LEAF_EO_const_2-_leaf_eo+0x4
+ .long LEAF_EO_const_3-_leaf_eo+0x4
+ .long LEAF_EO_const_4-_leaf_eo+0x5
+ .long LEAF_EO_const_5-_leaf_eo+0x5
+ .long LEAF_EO_const_6-_leaf_eo+0x4
+ .long LEAF_EO_const_7-_leaf_eo+0x5
+_sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-_leaf_oe+0x5
+ .long LEAF_OE_const_1-_leaf_oe+0x4
+ .long LEAF_OE_const_2-_leaf_oe+0x4
+ .long LEAF_OE_const_3-_leaf_oe+0x5
+ .long LEAF_OE_const_4-_leaf_oe+0x5
+ .long LEAF_OE_const_5-_leaf_oe+0x5
+ .long LEAF_OE_const_6-_leaf_oe+0x4
+ .long LEAF_OE_const_7-_leaf_oe+0x4
+#else
+ .globl sse_leaf_ee_offsets
+ .globl sse_leaf_oo_offsets
+ .globl sse_leaf_eo_offsets
+ .globl sse_leaf_oe_offsets
+ .align 4
+sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-leaf_ee+0x4
+ .long LEAF_EE_const_1-leaf_ee+0x5
+ .long LEAF_EE_const_2-leaf_ee+0x5
+ .long LEAF_EE_const_3-leaf_ee+0x5
+ .long LEAF_EE_const_4-leaf_ee+0x5
+ .long LEAF_EE_const_5-leaf_ee+0x5
+ .long LEAF_EE_const_6-leaf_ee+0x4
+ .long LEAF_EE_const_7-leaf_ee+0x5
+sse_leaf_oo_offsets:
+ .long LEAF_OO_const_0-leaf_oo+0x4
+ .long LEAF_OO_const_1-leaf_oo+0x4
+ .long LEAF_OO_const_2-leaf_oo+0x5
+ .long LEAF_OO_const_3-leaf_oo+0x5
+ .long LEAF_OO_const_4-leaf_oo+0x4
+ .long LEAF_OO_const_5-leaf_oo+0x5
+ .long LEAF_OO_const_6-leaf_oo+0x5
+ .long LEAF_OO_const_7-leaf_oo+0x5
+sse_leaf_eo_offsets:
+ .long LEAF_EO_const_0-leaf_eo+0x5
+ .long LEAF_EO_const_1-leaf_eo+0x4
+ .long LEAF_EO_const_2-leaf_eo+0x4
+ .long LEAF_EO_const_3-leaf_eo+0x4
+ .long LEAF_EO_const_4-leaf_eo+0x5
+ .long LEAF_EO_const_5-leaf_eo+0x5
+ .long LEAF_EO_const_6-leaf_eo+0x4
+ .long LEAF_EO_const_7-leaf_eo+0x5
+sse_leaf_oe_offsets:
+ .long LEAF_OE_const_0-leaf_oe+0x5
+ .long LEAF_OE_const_1-leaf_oe+0x4
+ .long LEAF_OE_const_2-leaf_oe+0x4
+ .long LEAF_OE_const_3-leaf_oe+0x5
+ .long LEAF_OE_const_4-leaf_oe+0x5
+ .long LEAF_OE_const_5-leaf_oe+0x5
+ .long LEAF_OE_const_6-leaf_oe+0x4
+ .long LEAF_OE_const_7-leaf_oe+0x4
+#endif
+
+#ifdef __APPLE__
+ .data
+#else
+ .section .data
+#endif
+ .p2align 4
+#ifdef __APPLE__
+ .globl _sse_constants
+_sse_constants:
+#else
+ .globl sse_constants
+sse_constants:
+#endif
+ .long 0x00000000,0x80000000,0x00000000,0x80000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
+#ifdef __APPLE__
+ .globl _sse_constants_inv
+_sse_constants_inv:
+#else
+ .globl sse_constants_inv
+sse_constants_inv:
+#endif
+ .long 0x80000000,0x00000000,0x80000000,0x00000000
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .long 0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .long 0x00000000,0x00000000,0x3f3504f3,0xbf3504f3