summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am16
-rw-r--r--src/Makefile.in358
-rw-r--r--src/cp_sse.c344
-rw-r--r--src/cp_sse.h40
-rw-r--r--src/macros.h308
-rw-r--r--src/patterns.c160
-rw-r--r--src/patterns.h10
7 files changed, 1236 insertions, 0 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..a2ea644
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,16 @@
+
+OBJLIBS = libffts.a
+HDRS = cp_sse.h patterns.h macros.h
+FILES = cp_sse.c patterns.c
+OBJS = cp_sse.o patterns.o
+
+all: $(OBJLIBS)
+
+%.o: %.c $(HDRS)
+ $(CC) $(CFLAGS) -c -o $@ $< -I../include
+
+$(OBJLIBS): $(OBJS)
+ $(AR) rcs libffts.a $(OBJS)
+
+clean:
+ $(RM) -f *.o $(OBJLIBS)
diff --git a/src/Makefile.in b/src/Makefile.in
new file mode 100644
index 0000000..73fecf8
--- /dev/null
+++ b/src/Makefile.in
@@ -0,0 +1,358 @@
+# Makefile.in generated by automake 1.12.2 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+VPATH = @srcdir@
+am__make_dryrun = \
+ { \
+ am__dry=no; \
+ case $$MAKEFLAGS in \
+ *\\[\ \ ]*) \
+ echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \
+ | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
+ *) \
+ for am__flg in $$MAKEFLAGS; do \
+ case $$am__flg in \
+ *=*|--*) ;; \
+ *n*) am__dry=yes; break;; \
+ esac; \
+ done;; \
+ esac; \
+ test $$am__dry = yes; \
+ }
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+subdir = src
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+ $(ACLOCAL_M4)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+SOURCES =
+DIST_SOURCES =
+am__can_run_installinfo = \
+ case $$AM_UPDATE_INFO_DIR in \
+ n|no|NO) false;; \
+ *) (install-info --version) >/dev/null 2>&1;; \
+ esac
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CC = @CC@
+CCDEPMODE = @CCDEPMODE@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CXX = @CXX@
+CXXDEPMODE = @CXXDEPMODE@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFS = @DEFS@
+DEPDIR = @DEPDIR@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+GREP = @GREP@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LDFLAGS = @LDFLAGS@
+LIBOBJS = @LIBOBJS@
+LIBS = @LIBS@
+LTLIBOBJS = @LTLIBOBJS@
+MAKEINFO = @MAKEINFO@
+MKDIR_P = @MKDIR_P@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+STRIP = @STRIP@
+VERSION = @VERSION@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+am__include = @am__include@
+am__leading_dot = @am__leading_dot@
+am__quote = @am__quote@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build_alias = @build_alias@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+host_alias = @host_alias@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+OBJLIBS = libffts.a
+HDRS = cp_sse.h patterns.h macros.h
+FILES = cp_sse.c patterns.c
+OBJS = cp_sse.o patterns.o
+all: all-am
+
+.SUFFIXES:
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
+ @for dep in $?; do \
+ case '$(am__configure_deps)' in \
+ *$$dep*) \
+ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+ && { if test -f $@; then exit 0; else break; fi; }; \
+ exit 1;; \
+ esac; \
+ done; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
+ $(am__cd) $(top_srcdir) && \
+ $(AUTOMAKE) --foreign src/Makefile
+.PRECIOUS: Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ @case '$?' in \
+ *config.status*) \
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+ *) \
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+ esac;
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: $(am__configure_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
+ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+tags: TAGS
+TAGS:
+
+ctags: CTAGS
+CTAGS:
+
+cscope cscopelist:
+
+
+distdir: $(DISTFILES)
+ @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+ list='$(DISTFILES)'; \
+ dist_files=`for file in $$list; do echo $$file; done | \
+ sed -e "s|^$$srcdirstrip/||;t" \
+ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+ case $$dist_files in \
+ */*) $(MKDIR_P) `echo "$$dist_files" | \
+ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+ sort -u` ;; \
+ esac; \
+ for file in $$dist_files; do \
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+ if test -d $$d/$$file; then \
+ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+ if test -d "$(distdir)/$$file"; then \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+ fi; \
+ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+ else \
+ test -f "$(distdir)/$$file" \
+ || cp -p $$d/$$file "$(distdir)/$$file" \
+ || exit 1; \
+ fi; \
+ done
+check-am: all-am
+check: check-am
+all-am: Makefile
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+ if test -z '$(STRIP)'; then \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ install; \
+ else \
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+ "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+ fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+ -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+ @echo "This command is intended for maintainers to use"
+ @echo "it deletes files that may require special tools to rebuild."
+clean-am: clean-generic mostlyclean-am
+
+distclean: distclean-am
+ -rm -f Makefile
+distclean-am: clean-am distclean-generic
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+ -rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-generic
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: all all-am check check-am clean clean-generic distclean \
+ distclean-generic distdir dvi dvi-am html html-am info info-am \
+ install install-am install-data install-data-am install-dvi \
+ install-dvi-am install-exec install-exec-am install-html \
+ install-html-am install-info install-info-am install-man \
+ install-pdf install-pdf-am install-ps install-ps-am \
+ install-strip installcheck installcheck-am installdirs \
+ maintainer-clean maintainer-clean-generic mostlyclean \
+ mostlyclean-generic pdf pdf-am ps ps-am uninstall uninstall-am
+
+
+all: $(OBJLIBS)
+
+%.o: %.c $(HDRS)
+ $(CC) $(CFLAGS) -c -o $@ $< -I../include
+
+$(OBJLIBS): $(OBJS)
+ $(AR) rcs libffts.a $(OBJS)
+
+clean:
+ $(RM) -f *.o $(OBJLIBS)
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/src/cp_sse.c b/src/cp_sse.c
new file mode 100644
index 0000000..f59c60b
--- /dev/null
+++ b/src/cp_sse.c
@@ -0,0 +1,344 @@
+#include "cp_sse.h"
+#include "macros.h"
+#include "patterns.h"
+
+__INLINE void
+firstpass_type_1(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) {
+ size_t i, i0 = p->i0, i1 = p->i1;
+ size_t *offsets = (size_t *)p->offsets;
+ size_t *is = (size_t *)p->is;
+ for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
+ for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
+ LEAF_OE(&is, in, &offsets, out);
+ for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
+}
+
+__INLINE void
+firstpass_type_2(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) {
+ size_t i, i0 = p->i0, i1 = p->i1;
+ size_t *offsets = (size_t *)p->offsets;
+ size_t *is = (size_t *)p->is;
+ for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out);
+ LEAF_EO(&is, in, &offsets, out);
+ for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out);
+ for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out);
+}
+
+__INLINE void
+firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) {
+ size_t *offsets = (size_t *)p->offsets;
+ size_t *is = (size_t *)p->is;
+ LEAF_EE(&is, in, &offsets, out);
+ LEAF_OE(&is, in, &offsets, out);
+}
+
+void
+firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
+ __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31;
+
+ L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19);
+ L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23);
+ K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
+ L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31);
+ L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27);
+ K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13);
+ K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
+ K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r16_17,&r18_19,&r20_21,&r22_23);
+ K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r24_25,&r26_27,&r28_29,&r30_31);
+ K_N(VLIT4(0.98078528040323043057924223830923,0.98078528040323043057924223830923,1,1),VLIT4(0.19509032201612824808378832130984,-0.19509032201612824808378832130984,-0,0),&r0_1,&r8_9,&r16_17,&r24_25);
+ S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48);
+ K_N(VLIT4(0.8314696123025452356714026791451,0.8314696123025452356714026791451,0.92387953251128673848313610506011,0.92387953251128673848313610506011),VLIT4(0.55557023301960217764872140833177,-0.55557023301960217764872140833177,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r2_3,&r10_11,&r18_19,&r26_27);
+ S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52);
+ K_N(VLIT4(0.55557023301960228867102387084742,0.55557023301960228867102387084742,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.83146961230254512464910021662945,-0.83146961230254512464910021662945,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r4_5,&r12_13,&r20_21,&r28_29);
+ S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56);
+ K_N(VLIT4(0.19509032201612830359493955256767,0.19509032201612830359493955256767,0.38268343236508983729038391174981,0.38268343236508983729038391174981),VLIT4(0.98078528040323043057924223830923,-0.98078528040323043057924223830923,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r6_7,&r14_15,&r22_23,&r30_31);
+ S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60);
+
+}
+
+void
+firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
+ __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
+
+ L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11);
+ L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13);
+ K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24);
+ K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28);
+
+}
+void
+firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
+ __m128 r0_1,r2_3,r4_5,r6_7;
+ L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7);
+ K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12);
+}
+void
+firstpass_4(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
+ __m128 r0,r1,r2,r3;
+ L_4(in+0,in+4,in+2,in+6,&r0,&r1,&r2,&r3);
+ S_4(r0,r1,r2,r3,out+0,out+2,out+4,out+6);
+}
+void
+firstpass_2(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) {
+ __m128 r0,r1;
+ L_S2(in+0,in+2,&r0,&r1);
+ S_2(r0,r1,out+0,out+2);
+}
+
+void X_8(data_t * restrict data0, size_t N, const data_t * restrict LUT) {
+ data_t *data2 = data0 + 2*N/4;
+ data_t *data4 = data0 + 4*N/4;
+ data_t *data6 = data0 + 6*N/4;
+ data_t *data1 = data0 + 1*N/4;
+ data_t *data3 = data0 + 3*N/4;
+ data_t *data5 = data0 + 5*N/4;
+ data_t *data7 = data0 + 7*N/4;
+ size_t k, n4 = N/4;
+
+ for(k=N/8/2;k>0;--k) {
+ __m128 r0, r1, r2, r3, r4, r5, r6, r7;
+ r0 = _mm_load_ps(data0);
+ r1 = _mm_load_ps(data1);
+ r2 = _mm_load_ps(data2);
+ r3 = _mm_load_ps(data3);
+ K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &r0, &r1, &r2, &r3);
+ r4 = _mm_load_ps(data4);
+ r6 = _mm_load_ps(data6);
+ K_N(_mm_load_ps(LUT+8), _mm_load_ps(LUT+12), &r0, &r2, &r4, &r6);
+ r5 = _mm_load_ps(data5);
+ r7 = _mm_load_ps(data7);
+ K_N(_mm_load_ps(LUT+16), _mm_load_ps(LUT+20), &r1, &r3, &r5, &r7);
+ LUT += 24;
+ _mm_store_ps(data0, r0); data0 += 4;
+ _mm_store_ps(data1, r1); data1 += 4;
+ _mm_store_ps(data2, r2); data2 += 4;
+ _mm_store_ps(data3, r3); data3 += 4;
+ _mm_store_ps(data4, r4); data4 += 4;
+ _mm_store_ps(data5, r5); data5 += 4;
+ _mm_store_ps(data6, r6); data6 += 4;
+ _mm_store_ps(data7, r7); data7 += 4;
+ }
+}
+void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out, size_t N) {
+ transform_index_t *ps = p->transforms;
+
+ p->firstpass((const float *)in, (float *)out, N, p);
+ while(ps[0]) {
+
+ if(ps[0] == 32) {
+ float *LUT = (float *)p->ws[0];
+ float *data = (float *)(out) + ps[1];
+ size_t n = 32;
+ size_t i;
+ for(i=0;i<n/4/2;i++) {
+ __m128 uk = _mm_load_ps(data);
+ __m128 uk2 = _mm_load_ps(data + 2*n/4);
+ __m128 zk_p = _mm_load_ps(data + 4*n/4);
+ __m128 zk_n = _mm_load_ps(data + 6*n/4);
+
+ K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &uk, &uk2, &zk_p, &zk_n);
+
+ _mm_store_ps(data, uk);
+ _mm_store_ps(data + 2*n/4, uk2);
+ _mm_store_ps(data + 4*n/4, zk_p);
+ _mm_store_ps(data + 6*n/4, zk_n);
+
+ LUT += 8;
+ data += 4;
+ }
+
+
+ }else{
+ int index = __builtin_ctzl(ps[0])-5;
+ float *LUT = (float *)p->ws[__builtin_ctzl(ps[0])-5];
+ X_8(((float *)out) + ps[1], ps[0], LUT);
+ }
+
+ ps += 2;
+ }
+
+
+}
+
+
+ffts_plan_t *ffts_init(size_t N) {
+ ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+ size_t leafN = 16;
+ size_t i;
+
+ if(N > 32) {
+ init_offsets(p, N, leafN);
+ init_is(p, N, leafN, 2);
+ init_tree(p, N, leafN);
+
+ if(N == 64) p->firstpass = &firstpass_64;
+ else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2;
+ else p->firstpass = &firstpass_type_1;
+ /* LUTS */
+
+ size_t n_luts = __builtin_ctzl(N/leafN);
+
+ p->i0 = N/leafN/3+1;
+ p->i1 = N/leafN/3;
+ if((N/leafN) % 3 > 1) p->i1++;
+ p->i0/=2;
+ p->i1/=2;
+
+
+ // printf("n_luts = %zu\n", n_luts);
+ p->ws = malloc(n_luts * sizeof(data_t *));
+ cdata_t *w;
+
+ int n = leafN*2;
+ for(i=0;i<n_luts;i++) {
+ // printf("LUT[%zu] = %d\n", i, n);
+ if(!i) {
+
+ w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32);
+
+ cdata_t *w0 = _mm_malloc(n/4 * sizeof(cdata_t), 32);
+
+ size_t j;
+ for(j=0;j<n/4;j++) {
+ w0[j] = W(n,j);
+ }
+
+ __m128 temp0, temp1, temp2;
+
+ float *fw = (float *)w;
+ float *fw0 = (float *)w0;
+ for(j=0;j<n/4;j+=2) {
+ temp0 = _mm_load_ps(fw0 + j*2);
+ __m128 re, im;
+ re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0));
+ im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1));
+ im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ _mm_store_ps(fw + j*4 , re);
+ _mm_store_ps(fw + j*4+4, im);
+ }
+
+ // for(j=0;j<n/2;j++) {
+ // printf("%f %f\n", creal(w[j]), cimag(w[j]));
+
+ // }
+
+ _mm_free(w0);
+ }else{
+ w = _mm_malloc(n/8 * 3 * 2 * sizeof(cdata_t), 32);
+
+ cdata_t *w0 = _mm_malloc(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w1 = _mm_malloc(n/8 * sizeof(cdata_t), 32);
+ cdata_t *w2 = _mm_malloc(n/8 * sizeof(cdata_t), 32);
+
+ size_t j;
+ for(j=0;j<n/8;j++) {
+ w0[j] = W(n,j*2);
+ w1[j] = W(n,j);
+ w2[j] = W(n,j + (n/8));
+
+ }
+
+ __m128 temp0, temp1, temp2, re, im;
+
+ float *fw = (float *)w;
+ float *fw0 = (float *)w0;
+ float *fw1 = (float *)w1;
+ float *fw2 = (float *)w2;
+ for(j=0;j<n/8;j+=2) {
+ temp0 = _mm_load_ps(fw0 + j*2);
+ re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0));
+ im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1));
+ im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ _mm_store_ps(fw + j*2*6 , re);
+ _mm_store_ps(fw + j*2*6+4, im);
+
+ temp1 = _mm_load_ps(fw1 + j*2);
+ re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0));
+ im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1));
+ im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ _mm_store_ps(fw + j*2*6+8 , re);
+ _mm_store_ps(fw + j*2*6+12, im);
+
+ temp2 = _mm_load_ps(fw2 + j*2);
+ re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0));
+ im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1));
+ im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ _mm_store_ps(fw + j*2*6+16, re);
+ _mm_store_ps(fw + j*2*6+20, im);
+ }
+
+ _mm_free(w0);
+ _mm_free(w1);
+ _mm_free(w2);
+ }
+ p->ws[i] = w;
+
+ n *= 2;
+ }
+
+ p->n_bits = log(N)/log(2) - log(leafN*2)/log(2);
+ }else{
+ p->transforms = malloc(2 * sizeof(transform_index_t));
+ p->transforms[0] = 0;
+ p->transforms[1] = 1;
+ if(N == 2) p->firstpass = &firstpass_2;
+ else if(N == 4) p->firstpass = &firstpass_4;
+ else if(N == 8) p->firstpass = &firstpass_8;
+ else if(N == 16) p->firstpass = &firstpass_16;
+ else if(N == 32) p->firstpass = &firstpass_32;
+
+ }
+
+ return p;
+}
+
+int main(int argc, char *argv[]) {
+ int n = atoi(argv[1]);
+ int count = atoi(argv[2]);
+
+ ffts_plan_t *p = ffts_init(n);
+ cdata_t *in = _mm_malloc(n * sizeof(cdata_t), 32);
+ cdata_t *out = _mm_malloc(n * sizeof(cdata_t), 32);
+
+ size_t i;
+ for(i=0;i<n;i++) in[i] = i;
+
+ p->leaftime = 0;
+
+ if(count>1){
+ for(i=0;i<count;i++) ffts_execute(p, (const float *)in, (float *)out, n);
+ }
+ p->leaftime = 0;
+
+ uint64_t start = mach_absolute_time();
+ for(i=0;i<count;i++) ffts_execute(p, (const float *)in, (float *)out, n);
+ uint64_t elapsed = mach_absolute_time() - start;
+
+ static double conversion = 0.0;
+
+ if( conversion == 0.0 )
+ {
+ mach_timebase_info_data_t info;
+ kern_return_t err = mach_timebase_info( &info );
+
+ //Convert the timebase into seconds
+ if( err == 0 )
+ conversion = 1e-9 * (double) info.numer / (double) info.denom;
+ }
+
+ double tt = conversion * (double) elapsed;
+ tt /= (double) count;
+ double lt = conversion * (double) (p->leaftime);
+// lt /= (double) count;
+ for(i=0;i<n;i++) printf("%f %f\n", creal(out[i]), cimag(out[i]));
+
+ double ctgs = 5.0f * (double)n * (log(n)/log(2)) / (tt * 1000000000.0f);
+
+ printf("Time: %f seconds, CTGs: %f Leaftime: %f \n", tt, ctgs, lt);
+
+ return 0;
+}
diff --git a/src/cp_sse.h b/src/cp_sse.h
new file mode 100644
index 0000000..c6d15dc
--- /dev/null
+++ b/src/cp_sse.h
@@ -0,0 +1,40 @@
+#ifndef __CP_SSE_H__
+#define __CP_SSE_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <complex.h>
+#include <stddef.h>
+#include <xmmintrin.h>
+#include <stdint.h>
+#include <mach/mach_time.h>
+
+typedef complex float cdata_t;
+typedef float data_t;
+
+#define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N)))
+
+typedef size_t transform_index_t;
+
+struct _ffts_plan_t {
+ ptrdiff_t *is;
+ ptrdiff_t *offsets;
+ void __attribute__ ((aligned(32))) **ws;
+ void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict);
+ size_t i0, i1, i2;
+ uint64_t n_bits, leaftime;
+
+ transform_index_t *transforms;
+};
+
+
+typedef struct _ffts_plan_t ffts_plan_t;
+
+
+typedef struct _split_vec_t {
+ __m128 re, im;
+} split_vec_t;
+
+
+#endif
diff --git a/src/macros.h b/src/macros.h
new file mode 100644
index 0000000..0a84802
--- /dev/null
+++ b/src/macros.h
@@ -0,0 +1,308 @@
+#ifndef __MACROS_H__
+#define __MACROS_H__
+
+#include "cp_sse.h"
+
+#define __INLINE static inline __attribute__((always_inline))
+
+#define VLIT4 _mm_set_ps
+
+__INLINE __m128 IMULI(__m128 a) {
+ __m128 temp = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
+ return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1));
+}
+
+__INLINE void
+S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) {
+ __m128 t0, t1, t2, t3;
+ _mm_store_ps(o0, r0);
+ _mm_store_ps(o1, r1);
+ _mm_store_ps(o2, r2);
+ _mm_store_ps(o3, r3);
+}
+__INLINE void S_2(__m128 r0, __m128 r1, data_t * restrict o0, data_t * restrict o1) {
+ _mm_store_ps(o0, r0);
+ _mm_store_ps(o1, r1);
+}
+
+__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, __m128 * restrict r0, __m128 * restrict r1) {
+ __m128 t0, t1;
+ t0 = _mm_load_ps(i0);
+ t1 = _mm_load_ps(i1);
+ *r0 = _mm_add_ps(t0, t1);
+ *r1 = _mm_sub_ps(t0, t1);
+}
+__INLINE void
+L_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3;
+ t0 = _mm_load_ps(i0);
+ t1 = _mm_load_ps(i1);
+ t2 = _mm_load_ps(i2);
+ t3 = _mm_load_ps(i3);
+ *r0 = _mm_add_ps (t0, t1);
+ *r1 = _mm_sub_ps (t0, t1);
+ *r2 = _mm_add_ps (t2, t3);
+ *r3 = _mm_sub_ps (t2, t3);
+}
+
+__INLINE void
+L_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = _mm_load_ps(i0);
+ t1 = _mm_load_ps(i1);
+ t2 = _mm_load_ps(i2);
+ t3 = _mm_load_ps(i3);
+ t4 = _mm_add_ps (t0, t1);
+ t5 = _mm_sub_ps (t0, t1);
+ t6 = _mm_add_ps (t2, t3);
+ t7 = IMULI(_mm_sub_ps (t2, t3));
+ *r0 = _mm_add_ps (t4, t6);
+ *r2 = _mm_sub_ps (t4, t6);
+ *r1 = _mm_sub_ps (t5, t7);
+ *r3 = _mm_add_ps (t5, t7);
+}
+
+__INLINE void
+K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 uk, uk2, zk, zk_d;
+ uk = *r0;
+ uk2 = *r1;
+ zk = _mm_add_ps(*r2, *r3);
+ zk_d = IMULI(_mm_sub_ps(*r2, *r3));
+ *r0 = _mm_add_ps(uk, zk);
+ *r2 = _mm_sub_ps(uk, zk);
+ *r1 = _mm_sub_ps(uk2, zk_d);
+ *r3 = _mm_add_ps(uk2, zk_d);
+}
+
+__INLINE __m128 IMUL(__m128 d, __m128 re, __m128 im) {
+ re = _mm_mul_ps(re, d);
+ im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1)));
+ return _mm_sub_ps(re, im);
+}
+
+__INLINE __m128 IMULJ(__m128 d, __m128 re, __m128 im) {
+ re = _mm_mul_ps(re, d);
+ im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1)));
+ return _mm_add_ps(re, im);
+}
+
+__INLINE void
+K_N(__m128 re, __m128 im, __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 uk, uk2, zk_p, zk_n, zk, zk_d;
+
+ uk = *r0;
+ uk2 = *r1;
+ zk_p = IMUL(*r2, re, im);
+ zk_n = IMULJ(*r3, re, im);
+
+ zk = _mm_add_ps(zk_p, zk_n);
+ zk_d = IMULI(_mm_sub_ps(zk_p, zk_n));
+
+ *r2 = _mm_sub_ps(uk, zk);
+ *r0 = _mm_add_ps(uk, zk);
+ *r3 = _mm_add_ps(uk2, zk_d);
+ *r1 = _mm_sub_ps(uk2, zk_d);
+}
+
+__INLINE void TX2(__m128 *a, __m128 *b) {
+ __m128 TX2_t0 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(1,0,1,0));
+ __m128 TX2_t1 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(3,2,3,2));
+ *a = TX2_t0; *b = TX2_t1;
+}
+
+__INLINE void
+LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+ __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
+
+ data_t *out0 = out + (*out_offsets)[0];
+ data_t *out1 = out + (*out_offsets)[1];
+ *out_offsets += 2;
+
+ L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3);
+ L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7);
+ K_0(&r0,&r2,&r4,&r6);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7);
+ L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11);
+ L_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15);
+ K_0(&r0,&r4,&r8,&r12);
+ K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r1,&r5,&r9,&r13);
+ TX2(&r0,&r1);
+ TX2(&r4,&r5);
+ TX2(&r8,&r9);
+ TX2(&r12,&r13);
+ S_4(r0,r4,r8,r12,out0+0,out0+8,out0+16,out0+24);
+ S_4(r1,r5,r9,r13,out1+0,out1+8,out1+16,out1+24);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2,&r6,&r10,&r14);
+ K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r3,&r7,&r11,&r15);
+ TX2(&r2,&r3);
+ TX2(&r6,&r7);
+ TX2(&r10,&r11);
+ TX2(&r14,&r15);
+ S_4(r2,r6,r10,r14,out0+4,out0+12,out0+20,out0+28);
+ S_4(r3,r7,r11,r15,out1+4,out1+12,out1+20,out1+28);
+ *is += 16;
+}
+
+
+__INLINE void
+LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+ __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
+
+ data_t *out0 = out + (*out_offsets)[0];
+ data_t *out1 = out + (*out_offsets)[1];
+ *out_offsets += 2;
+
+ L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3);
+ L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7);
+ K_0(&r0,&r2,&r4,&r6);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7);
+ TX2(&r0,&r1);
+ TX2(&r2,&r3);
+ TX2(&r4,&r5);
+ TX2(&r6,&r7);
+ S_4(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12);
+ S_4(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12);
+ L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11);
+ L_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15);
+ K_0(&r8,&r10,&r12,&r14);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r9,&r11,&r13,&r15);
+ TX2(&r8,&r9);
+ TX2(&r10,&r11);
+ TX2(&r12,&r13);
+ TX2(&r14,&r15);
+ S_4(r8,r10,r12,r14,out0+16,out0+20,out0+24,out0+28);
+ S_4(r9,r11,r13,r15,out1+16,out1+20,out1+24,out1+28);
+
+ *is += 16;
+}
+
+__INLINE void
+L_4_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3);
+ t4 = _mm_add_ps(t0, t1);
+ t5 = _mm_sub_ps(t0, t1);
+ t6 = _mm_add_ps(t2, t3);
+ t7 = IMULI(_mm_sub_ps(t2, t3));
+ t0 = _mm_add_ps(t4, t6);
+ t2 = _mm_sub_ps(t4, t6);
+ t1 = _mm_sub_ps(t5, t7);
+ t3 = _mm_add_ps(t5, t7);
+ TX2(&t0,&t1);
+ TX2(&t2,&t3);
+ *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; }
+
+__INLINE void
+L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); t4 = _mm_add_ps(t0, t1);
+ t5 = _mm_sub_ps(t0, t1);
+ t6 = _mm_add_ps(t2, t3);
+ t7 = _mm_sub_ps(t2, t3);
+ TX2(&t4,&t5);
+ TX2(&t6,&t7);
+ *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
+}
+
+__INLINE void
+L_2_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3);
+ t4 = _mm_add_ps(t0, t1);
+ t5 = _mm_sub_ps(t0, t1);
+ t6 = _mm_add_ps(t2, t3);
+ t7 = _mm_sub_ps(t2, t3);
+ *r0 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(1,0,1,0));
+ *r1 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(1,0,1,0));
+ t5 = IMULI(t5);
+ t0 = _mm_add_ps(t6, t4);
+ t2 = _mm_sub_ps(t6, t4);
+ t1 = _mm_sub_ps(t7, t5);
+ t3 = _mm_add_ps(t7, t5);
+ *r3 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,2,3,2));
+ *r2 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,2,3,2));
+}
+
+__INLINE void
+L_4_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3,
+ __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
+ __m128 t0, t1, t2, t3, t4, t5, t6, t7;
+ t0 = _mm_load_ps(i0);
+ t1 = _mm_load_ps(i1);
+ t6 = _mm_load_ps(i2);
+ t7 = _mm_load_ps(i3);
+ t2 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,1,0));
+ t3 = _mm_shuffle_ps(t7, t6, _MM_SHUFFLE(3,2,1,0));
+ t4 = _mm_add_ps(t0, t1);
+ t5 = _mm_sub_ps(t0, t1);
+ t6 = _mm_add_ps(t2, t3);
+ t7 = _mm_sub_ps(t2, t3);
+ *r2 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,2,3,2));
+ *r3 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,3,2));
+ t7 = IMULI(t7);
+ t0 = _mm_add_ps(t4, t6);
+ t2 = _mm_sub_ps(t4, t6);
+ t1 = _mm_sub_ps(t5, t7);
+ t3 = _mm_add_ps(t5, t7);
+ *r0 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(1,0,1,0));
+ *r1 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(1,0,1,0));
+}
+
+__INLINE void
+LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+ __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31;
+
+ data_t *out0 = out + (*out_offsets)[0];
+ data_t *out1 = out + (*out_offsets)[1];
+ *out_offsets += 2;
+
+ L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r24_25,&r26_27);
+ L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r30_31,&r28_29);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
+ S_4(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12);
+ L_4_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r16_17,&r18_19);
+ L_2_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r20_21,&r22_23);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r8_9,&r10_11,&r12_13,&r14_15);
+ S_4(r8_9,r10_11,r12_13,r14_15,out0+16,out0+20,out0+24,out0+28);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
+ K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r16_17,&r20_21,&r24_25,&r28_29);
+ S_4(r16_17,r20_21,r24_25,r28_29,out1+0,out1+8,out1+16,out1+24);
+ K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r18_19,&r22_23,&r26_27,&r30_31);
+ S_4(r18_19,r22_23,r26_27,r30_31,out1+4,out1+12,out1+20,out1+28);
+
+ *is += 16;
+}
+
+__INLINE void
+LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) {
+ __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31;
+
+ data_t *out0 = out + (*out_offsets)[0];
+ data_t *out1 = out + (*out_offsets)[1];
+ *out_offsets += 2;
+
+ L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r16_17,&r18_19);
+ L_2_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r20_21,&r22_23);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
+ L_4_2(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r28_29,&r30_31);
+ L_4_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r24_25,&r26_27);
+ K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r0_1,&r4_5,&r8_9,&r12_13);
+ S_4(r0_1,r4_5,r8_9,r12_13,out0+0,out0+8,out0+16,out0+24);
+ K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15);
+ S_4(r2_3,r6_7,r10_11,r14_15,out0+4,out0+12,out0+20,out0+28);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
+ S_4(r16_17,r18_19,r20_21,r22_23,out1+0,out1+4,out1+8,out1+12);
+ K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r24_25,&r26_27,&r28_29,&r30_31);
+ S_4(r24_25,r26_27,r28_29,r30_31,out1+16,out1+20,out1+24,out1+28);
+
+ *is += 16;
+}
+
+
+#endif
diff --git a/src/patterns.c b/src/patterns.c
new file mode 100644
index 0000000..1ab593f
--- /dev/null
+++ b/src/patterns.c
@@ -0,0 +1,160 @@
+#include "patterns.h"
+
+void permute_addr(int N, int offset, int stride, int *d) {
+ int i, a[4] = {0,2,1,3};
+ for(i=0;i<4;i++) {
+ d[i] = offset + (a[i] << stride);
+ if(d[i] < 0) d[i] += N;
+ }
+}
+
+void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
+
+ if(N > 4) {
+ hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);
+ if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
+ if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
+ else {
+ int temp = poffset+(1<<stride);
+ if(temp < 0) temp += bigN;
+ temp *= 2;
+
+ if(!(temp % (VL*2))) {
+ (*is)[0] = poffset+(1<<stride);
+ (*is)[1] = poffset+(1<<stride)+(1<<(stride+2));
+ (*is)[2] = poffset-(1<<stride);
+ (*is)[3] = poffset-(1<<stride)+(1<<(stride+2));
+ int i;
+ for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN;
+ for(i=0;i<4;i++) (*is)[i] *= 2;
+ *is += 4;
+ }
+ }
+ }else if(N == 4) {
+ int perm[4];
+ permute_addr(bigN, poffset, stride, perm);
+ if(!((perm[0]*2) % (VL*2))) {
+ int i;
+ for(i=0;i<4;i++) {
+ (*is)[i] = perm[i] * 2;
+ }
+ *is += 4;
+ }
+ }
+}
+
+void init_is(ffts_plan_t *p, int N, int leafN, int VL) {
+ int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3;
+ int stride = log(N/leafN)/log(2);
+
+ p->is = malloc(N/VL * sizeof(ptrdiff_t));
+
+ ptrdiff_t *is = p->is;
+
+ if((N/leafN) % 3 > 1) i1++;
+
+ for(i=0;i<i0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+ for(i=i0;i<i0+i1;i++) {
+ hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
+ hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
+ }
+ for(i=0-i2;i<0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
+
+
+//for(i=0;i<N/VL;i++) {
+// printf("%td ", p->is[i]);
+// if(i % 16 == 15) printf("\n");
+//}
+
+ p->i0 = i0; p->i1 = i1; p->i2 = i2;
+}
+
+void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
+ if((even && N == leafN) || (!even && N <= leafN)) {
+ offsets[2*(ooffset/leafN)] = ioffset*2;
+ offsets[2*(ooffset/leafN)+1] = ooffset;
+ }else if(N > 4) {
+ elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
+ elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
+ if(N/4 >= leafN)
+ elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
+ }
+
+}
+
+int compare_offsets(const void *a, const void *b) {
+ return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
+}
+
+void init_offsets(ffts_plan_t *p, int N, int leafN) {
+
+ ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));
+
+ elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
+
+ size_t i;
+ for(i=0;i<2*N/leafN;i+=2) {
+ if(offsets[i] < 0) offsets[i] = N + offsets[i];
+ }
+
+ qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets);
+ //elaborate_is(p, N, 0, 0, 1);
+ p->offsets = malloc(N/leafN * sizeof(ptrdiff_t));
+ for(i=0;i<N/leafN;i++) {
+ p->offsets[i] = offsets[i*2+1]*2;
+ }
+//for(i=0;i<N/leafN;i++) {
+// printf("%td\n", p->offsets[i]);
+//}
+ free(offsets);
+
+
+}
+
+
+int tree_count(int N, int leafN, int offset) {
+
+ if(N <= leafN) return 0;
+ int count = 0;
+ count += tree_count(N/4, leafN, offset);
+ count += tree_count(N/8, leafN, offset + N/4);
+ count += tree_count(N/8, leafN, offset + N/4 + N/8);
+ count += tree_count(N/4, leafN, offset + N/2);
+ count += tree_count(N/4, leafN, offset + 3*N/4);
+
+ return 1 + count;
+}
+
+void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
+
+ if(N <= leafN) return;
+ elaborate_tree(p, N/4, leafN, offset);
+ elaborate_tree(p, N/8, leafN, offset + N/4);
+ elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
+ elaborate_tree(p, N/4, leafN, offset + N/2);
+ elaborate_tree(p, N/4, leafN, offset + 3*N/4);
+
+ (*p)[0] = N;
+ (*p)[1] = offset*2;
+
+ (*p)+=2;
+}
+
+void init_tree(ffts_plan_t *p, int N, int leafN) {
+
+ int count = tree_count(N, leafN, 0) + 1;
+ transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
+
+//printf("count = %d\n", count);
+
+ elaborate_tree(&ps, N, leafN, 0);
+ ps[0] = 0;
+ ps[1] = 0;
+//int i;
+//for(i=0;i<count;i++) {
+// printf("%d %d - %d\n", p->transforms[i*2], p->transforms[i*2+1],
+// __builtin_ctzl(p->transforms[i*2]) - 5);
+//}
+
+}
+
diff --git a/src/patterns.h b/src/patterns.h
new file mode 100644
index 0000000..298ca3e
--- /dev/null
+++ b/src/patterns.h
@@ -0,0 +1,10 @@
+#ifndef __PATTERNS_H__
+#define __PATTERNS_H__
+
+#include "cp_sse.h"
+
+void init_is(ffts_plan_t *p, int N, int leafN, int VL);
+void init_offsets(ffts_plan_t *p, int N, int leafN);
+void init_tree(ffts_plan_t *p, int N, int leafN);
+
+#endif
OpenPOWER on IntegriCloud