diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 16 | ||||
-rw-r--r-- | src/Makefile.in | 358 | ||||
-rw-r--r-- | src/cp_sse.c | 344 | ||||
-rw-r--r-- | src/cp_sse.h | 40 | ||||
-rw-r--r-- | src/macros.h | 308 | ||||
-rw-r--r-- | src/patterns.c | 160 | ||||
-rw-r--r-- | src/patterns.h | 10 |
7 files changed, 1236 insertions, 0 deletions
diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..a2ea644 --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,16 @@ + +OBJLIBS = libffts.a +HDRS = cp_sse.h patterns.h macros.h +FILES = cp_sse.c patterns.c +OBJS = cp_sse.o patterns.o + +all: $(OBJLIBS) + +%.o: %.c $(HDRS) + $(CC) $(CFLAGS) -c -o $@ $< -I../include + +$(OBJLIBS): $(OBJS) + $(AR) rcs libffts.a $(OBJS) + +clean: + $(RM) -f *.o $(OBJLIBS) diff --git a/src/Makefile.in b/src/Makefile.in new file mode 100644 index 0000000..73fecf8 --- /dev/null +++ b/src/Makefile.in @@ -0,0 +1,358 @@ +# Makefile.in generated by automake 1.12.2 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2012 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ +VPATH = @srcdir@ +am__make_dryrun = \ + { \ + am__dry=no; \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + echo 'am--echo: ; @echo "AM" OK' | $(MAKE) -f - 2>/dev/null \ + | grep '^AM OK$$' >/dev/null || am__dry=yes;; \ + *) \ + for am__flg in $$MAKEFLAGS; do \ + case $$am__flg in \ + *=*|--*) ;; \ + *n*) am__dry=yes; break;; \ + esac; \ + done;; \ + esac; \ + test $$am__dry = yes; \ + } +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +subdir = src +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +SOURCES = +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build_alias = @build_alias@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host_alias = @host_alias@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +OBJLIBS = libffts.a +HDRS = cp_sse.h patterns.h macros.h +FILES = cp_sse.c patterns.c +OBJS = cp_sse.o patterns.o +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): +tags: TAGS +TAGS: + +ctags: CTAGS +CTAGS: + +cscope cscopelist: + + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean-am: clean-generic mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: all all-am check check-am clean clean-generic distclean \ + distclean-generic distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-generic pdf pdf-am ps ps-am uninstall uninstall-am + + +all: $(OBJLIBS) + +%.o: %.c $(HDRS) + $(CC) $(CFLAGS) -c -o $@ $< -I../include + +$(OBJLIBS): $(OBJS) + $(AR) rcs libffts.a $(OBJS) + +clean: + $(RM) -f *.o $(OBJLIBS) + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/src/cp_sse.c b/src/cp_sse.c new file mode 100644 index 0000000..f59c60b --- /dev/null +++ b/src/cp_sse.c @@ -0,0 +1,344 @@ +#include "cp_sse.h" +#include "macros.h" +#include "patterns.h" + +__INLINE void +firstpass_type_1(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { + size_t i, i0 = p->i0, i1 = p->i1; + size_t *offsets = (size_t *)p->offsets; + size_t *is = (size_t *)p->is; + for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); + for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); + LEAF_OE(&is, in, &offsets, out); + for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +} + +__INLINE void +firstpass_type_2(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { + size_t i, i0 = p->i0, i1 = p->i1; + size_t *offsets = (size_t *)p->offsets; + size_t *is = (size_t *)p->is; + for(i=i0;i>0;--i) LEAF_EE(&is, in, &offsets, out); + LEAF_EO(&is, in, &offsets, out); + for(i=i1;i>0;--i) LEAF_OO(&is, in, &offsets, out); + for(i=i1;i>0;--i) LEAF_EE(&is, in, &offsets, out); +} + +__INLINE void +firstpass_64(const float * restrict in, float * restrict out, size_t N, ffts_plan_t * restrict p) { + size_t *offsets = (size_t *)p->offsets; + size_t *is = (size_t *)p->is; + LEAF_EE(&is, in, &offsets, out); + LEAF_OE(&is, in, &offsets, out); +} + +void +firstpass_32(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { + __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + + L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19); + L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23); + K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); + L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31); + L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27); + K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13); + K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); + K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r24_25,&r26_27,&r28_29,&r30_31); + K_N(VLIT4(0.98078528040323043057924223830923,0.98078528040323043057924223830923,1,1),VLIT4(0.19509032201612824808378832130984,-0.19509032201612824808378832130984,-0,0),&r0_1,&r8_9,&r16_17,&r24_25); + S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48); + K_N(VLIT4(0.8314696123025452356714026791451,0.8314696123025452356714026791451,0.92387953251128673848313610506011,0.92387953251128673848313610506011),VLIT4(0.55557023301960217764872140833177,-0.55557023301960217764872140833177,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r2_3,&r10_11,&r18_19,&r26_27); + S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52); + K_N(VLIT4(0.55557023301960228867102387084742,0.55557023301960228867102387084742,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.83146961230254512464910021662945,-0.83146961230254512464910021662945,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r4_5,&r12_13,&r20_21,&r28_29); + S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56); + K_N(VLIT4(0.19509032201612830359493955256767,0.19509032201612830359493955256767,0.38268343236508983729038391174981,0.38268343236508983729038391174981),VLIT4(0.98078528040323043057924223830923,-0.98078528040323043057924223830923,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r6_7,&r14_15,&r22_23,&r30_31); + S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60); + +} + +void +firstpass_16(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { + __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15; + + L_4_4(in+0,in+16,in+8,in+24,&r0_1,&r2_3,&r8_9,&r10_11); + L_2_4(in+4,in+20,in+28,in+12,&r4_5,&r6_7,&r14_15,&r12_13); + K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLIT4(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),VLIT4(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,-0,0),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,out+0,out+8,out+16,out+24); + K_N(VLIT4(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),VLIT4(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,out+4,out+12,out+20,out+28); + +} +void +firstpass_8(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { + __m128 r0_1,r2_3,r4_5,r6_7; + L_4_2(in+0,in+8,in+4,in+12,&r0_1,&r2_3,&r4_5,&r6_7); + K_N(VLIT4(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),VLIT4(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,-0,0),&r0_1,&r2_3,&r4_5,&r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,out+0,out+4,out+8,out+12); +} +void +firstpass_4(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { + __m128 r0,r1,r2,r3; + L_4(in+0,in+4,in+2,in+6,&r0,&r1,&r2,&r3); + S_4(r0,r1,r2,r3,out+0,out+2,out+4,out+6); +} +void +firstpass_2(const data_t * restrict in, data_t * restrict out, size_t N, ffts_plan_t * restrict p) { + __m128 r0,r1; + L_S2(in+0,in+2,&r0,&r1); + S_2(r0,r1,out+0,out+2); +} + +void X_8(data_t * restrict data0, size_t N, const data_t * restrict LUT) { + data_t *data2 = data0 + 2*N/4; + data_t *data4 = data0 + 4*N/4; + data_t *data6 = data0 + 6*N/4; + data_t *data1 = data0 + 1*N/4; + data_t *data3 = data0 + 3*N/4; + data_t *data5 = data0 + 5*N/4; + data_t *data7 = data0 + 7*N/4; + size_t k, n4 = N/4; + + for(k=N/8/2;k>0;--k) { + __m128 r0, r1, r2, r3, r4, r5, r6, r7; + r0 = _mm_load_ps(data0); + r1 = _mm_load_ps(data1); + r2 = _mm_load_ps(data2); + r3 = _mm_load_ps(data3); + K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &r0, &r1, &r2, &r3); + r4 = _mm_load_ps(data4); + r6 = _mm_load_ps(data6); + K_N(_mm_load_ps(LUT+8), _mm_load_ps(LUT+12), &r0, &r2, &r4, &r6); + r5 = _mm_load_ps(data5); + r7 = _mm_load_ps(data7); + K_N(_mm_load_ps(LUT+16), _mm_load_ps(LUT+20), &r1, &r3, &r5, &r7); + LUT += 24; + _mm_store_ps(data0, r0); data0 += 4; + _mm_store_ps(data1, r1); data1 += 4; + _mm_store_ps(data2, r2); data2 += 4; + _mm_store_ps(data3, r3); data3 += 4; + _mm_store_ps(data4, r4); data4 += 4; + _mm_store_ps(data5, r5); data5 += 4; + _mm_store_ps(data6, r6); data6 += 4; + _mm_store_ps(data7, r7); data7 += 4; + } +} +void ffts_execute(ffts_plan_t *p, const void * restrict in, void * restrict out, size_t N) { + transform_index_t *ps = p->transforms; + + p->firstpass((const float *)in, (float *)out, N, p); + while(ps[0]) { + + if(ps[0] == 32) { + float *LUT = (float *)p->ws[0]; + float *data = (float *)(out) + ps[1]; + size_t n = 32; + size_t i; + for(i=0;i<n/4/2;i++) { + __m128 uk = _mm_load_ps(data); + __m128 uk2 = _mm_load_ps(data + 2*n/4); + __m128 zk_p = _mm_load_ps(data + 4*n/4); + __m128 zk_n = _mm_load_ps(data + 6*n/4); + + K_N(_mm_load_ps(LUT), _mm_load_ps(LUT+4), &uk, &uk2, &zk_p, &zk_n); + + _mm_store_ps(data, uk); + _mm_store_ps(data + 2*n/4, uk2); + _mm_store_ps(data + 4*n/4, zk_p); + _mm_store_ps(data + 6*n/4, zk_n); + + LUT += 8; + data += 4; + } + + + }else{ + int index = __builtin_ctzl(ps[0])-5; + float *LUT = (float *)p->ws[__builtin_ctzl(ps[0])-5]; + X_8(((float *)out) + ps[1], ps[0], LUT); + } + + ps += 2; + } + + +} + + +ffts_plan_t *ffts_init(size_t N) { + ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); + size_t leafN = 16; + size_t i; + + if(N > 32) { + init_offsets(p, N, leafN); + init_is(p, N, leafN, 2); + init_tree(p, N, leafN); + + if(N == 64) p->firstpass = &firstpass_64; + else if(__builtin_ctzl(N) & 1) p->firstpass = &firstpass_type_2; + else p->firstpass = &firstpass_type_1; + /* LUTS */ + + size_t n_luts = __builtin_ctzl(N/leafN); + + p->i0 = N/leafN/3+1; + p->i1 = N/leafN/3; + if((N/leafN) % 3 > 1) p->i1++; + p->i0/=2; + p->i1/=2; + + + // printf("n_luts = %zu\n", n_luts); + p->ws = malloc(n_luts * sizeof(data_t *)); + cdata_t *w; + + int n = leafN*2; + for(i=0;i<n_luts;i++) { + // printf("LUT[%zu] = %d\n", i, n); + if(!i) { + + w = _mm_malloc(n/4 * 2 * sizeof(cdata_t), 32); + + cdata_t *w0 = _mm_malloc(n/4 * sizeof(cdata_t), 32); + + size_t j; + for(j=0;j<n/4;j++) { + w0[j] = W(n,j); + } + + __m128 temp0, temp1, temp2; + + float *fw = (float *)w; + float *fw0 = (float *)w0; + for(j=0;j<n/4;j+=2) { + temp0 = _mm_load_ps(fw0 + j*2); + __m128 re, im; + re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); + im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); + im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + _mm_store_ps(fw + j*4 , re); + _mm_store_ps(fw + j*4+4, im); + } + + // for(j=0;j<n/2;j++) { + // printf("%f %f\n", creal(w[j]), cimag(w[j])); + + // } + + _mm_free(w0); + }else{ + w = _mm_malloc(n/8 * 3 * 2 * sizeof(cdata_t), 32); + + cdata_t *w0 = _mm_malloc(n/8 * sizeof(cdata_t), 32); + cdata_t *w1 = _mm_malloc(n/8 * sizeof(cdata_t), 32); + cdata_t *w2 = _mm_malloc(n/8 * sizeof(cdata_t), 32); + + size_t j; + for(j=0;j<n/8;j++) { + w0[j] = W(n,j*2); + w1[j] = W(n,j); + w2[j] = W(n,j + (n/8)); + + } + + __m128 temp0, temp1, temp2, re, im; + + float *fw = (float *)w; + float *fw0 = (float *)w0; + float *fw1 = (float *)w1; + float *fw2 = (float *)w2; + for(j=0;j<n/8;j+=2) { + temp0 = _mm_load_ps(fw0 + j*2); + re = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 2, 0, 0)); + im = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(3, 3, 1, 1)); + im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + _mm_store_ps(fw + j*2*6 , re); + _mm_store_ps(fw + j*2*6+4, im); + + temp1 = _mm_load_ps(fw1 + j*2); + re = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(2, 2, 0, 0)); + im = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 3, 1, 1)); + im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + _mm_store_ps(fw + j*2*6+8 , re); + _mm_store_ps(fw + j*2*6+12, im); + + temp2 = _mm_load_ps(fw2 + j*2); + re = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(2, 2, 0, 0)); + im = _mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 1, 1)); + im = _mm_xor_ps(im, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + _mm_store_ps(fw + j*2*6+16, re); + _mm_store_ps(fw + j*2*6+20, im); + } + + _mm_free(w0); + _mm_free(w1); + _mm_free(w2); + } + p->ws[i] = w; + + n *= 2; + } + + p->n_bits = log(N)/log(2) - log(leafN*2)/log(2); + }else{ + p->transforms = malloc(2 * sizeof(transform_index_t)); + p->transforms[0] = 0; + p->transforms[1] = 1; + if(N == 2) p->firstpass = &firstpass_2; + else if(N == 4) p->firstpass = &firstpass_4; + else if(N == 8) p->firstpass = &firstpass_8; + else if(N == 16) p->firstpass = &firstpass_16; + else if(N == 32) p->firstpass = &firstpass_32; + + } + + return p; +} + +int main(int argc, char *argv[]) { + int n = atoi(argv[1]); + int count = atoi(argv[2]); + + ffts_plan_t *p = ffts_init(n); + cdata_t *in = _mm_malloc(n * sizeof(cdata_t), 32); + cdata_t *out = _mm_malloc(n * sizeof(cdata_t), 32); + + size_t i; + for(i=0;i<n;i++) in[i] = i; + + p->leaftime = 0; + + if(count>1){ + for(i=0;i<count;i++) ffts_execute(p, (const float *)in, (float *)out, n); + } + p->leaftime = 0; + + uint64_t start = mach_absolute_time(); + for(i=0;i<count;i++) ffts_execute(p, (const float *)in, (float *)out, n); + uint64_t elapsed = mach_absolute_time() - start; + + static double conversion = 0.0; + + if( conversion == 0.0 ) + { + mach_timebase_info_data_t info; + kern_return_t err = mach_timebase_info( &info ); + + //Convert the timebase into seconds + if( err == 0 ) + conversion = 1e-9 * (double) info.numer / (double) info.denom; + } + + double tt = conversion * (double) elapsed; + tt /= (double) count; + double lt = conversion * (double) (p->leaftime); +// lt /= (double) count; + for(i=0;i<n;i++) printf("%f %f\n", creal(out[i]), cimag(out[i])); + + double ctgs = 5.0f * (double)n * (log(n)/log(2)) / (tt * 1000000000.0f); + + printf("Time: %f seconds, CTGs: %f Leaftime: %f \n", tt, ctgs, lt); + + return 0; +} diff --git a/src/cp_sse.h b/src/cp_sse.h new file mode 100644 index 0000000..c6d15dc --- /dev/null +++ b/src/cp_sse.h @@ -0,0 +1,40 @@ +#ifndef __CP_SSE_H__ +#define __CP_SSE_H__ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <complex.h> +#include <stddef.h> +#include <xmmintrin.h> +#include <stdint.h> +#include <mach/mach_time.h> + +typedef complex float cdata_t; +typedef float data_t; + +#define W(N,k) (cexp(-2.0f * M_PI * I * (float)(k) / (float)(N))) + +typedef size_t transform_index_t; + +struct _ffts_plan_t { + ptrdiff_t *is; + ptrdiff_t *offsets; + void __attribute__ ((aligned(32))) **ws; + void (*firstpass)(const float * restrict, float * restrict, size_t, struct _ffts_plan_t * restrict); + size_t i0, i1, i2; + uint64_t n_bits, leaftime; + + transform_index_t *transforms; +}; + + +typedef struct _ffts_plan_t ffts_plan_t; + + +typedef struct _split_vec_t { + __m128 re, im; +} split_vec_t; + + +#endif diff --git a/src/macros.h b/src/macros.h new file mode 100644 index 0000000..0a84802 --- /dev/null +++ b/src/macros.h @@ -0,0 +1,308 @@ +#ifndef __MACROS_H__ +#define __MACROS_H__ + +#include "cp_sse.h" + +#define __INLINE static inline __attribute__((always_inline)) + +#define VLIT4 _mm_set_ps + +__INLINE __m128 IMULI(__m128 a) { + __m128 temp = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f)); + return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(2,3,0,1)); +} + +__INLINE void +S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3, data_t * restrict o0, data_t * restrict o1, data_t * restrict o2, data_t * restrict o3) { + __m128 t0, t1, t2, t3; + _mm_store_ps(o0, r0); + _mm_store_ps(o1, r1); + _mm_store_ps(o2, r2); + _mm_store_ps(o3, r3); +} +__INLINE void S_2(__m128 r0, __m128 r1, data_t * restrict o0, data_t * restrict o1) { + _mm_store_ps(o0, r0); + _mm_store_ps(o1, r1); +} + +__INLINE void L_S2(const data_t * restrict i0, const data_t * restrict i1, __m128 * restrict r0, __m128 * restrict r1) { + __m128 t0, t1; + t0 = _mm_load_ps(i0); + t1 = _mm_load_ps(i1); + *r0 = _mm_add_ps(t0, t1); + *r1 = _mm_sub_ps(t0, t1); +} +__INLINE void +L_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3; + t0 = _mm_load_ps(i0); + t1 = _mm_load_ps(i1); + t2 = _mm_load_ps(i2); + t3 = _mm_load_ps(i3); + *r0 = _mm_add_ps (t0, t1); + *r1 = _mm_sub_ps (t0, t1); + *r2 = _mm_add_ps (t2, t3); + *r3 = _mm_sub_ps (t2, t3); +} + +__INLINE void +L_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3, t4, t5, t6, t7; + t0 = _mm_load_ps(i0); + t1 = _mm_load_ps(i1); + t2 = _mm_load_ps(i2); + t3 = _mm_load_ps(i3); + t4 = _mm_add_ps (t0, t1); + t5 = _mm_sub_ps (t0, t1); + t6 = _mm_add_ps (t2, t3); + t7 = IMULI(_mm_sub_ps (t2, t3)); + *r0 = _mm_add_ps (t4, t6); + *r2 = _mm_sub_ps (t4, t6); + *r1 = _mm_sub_ps (t5, t7); + *r3 = _mm_add_ps (t5, t7); +} + +__INLINE void +K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 uk, uk2, zk, zk_d; + uk = *r0; + uk2 = *r1; + zk = _mm_add_ps(*r2, *r3); + zk_d = IMULI(_mm_sub_ps(*r2, *r3)); + *r0 = _mm_add_ps(uk, zk); + *r2 = _mm_sub_ps(uk, zk); + *r1 = _mm_sub_ps(uk2, zk_d); + *r3 = _mm_add_ps(uk2, zk_d); +} + +__INLINE __m128 IMUL(__m128 d, __m128 re, __m128 im) { + re = _mm_mul_ps(re, d); + im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); + return _mm_sub_ps(re, im); +} + +__INLINE __m128 IMULJ(__m128 d, __m128 re, __m128 im) { + re = _mm_mul_ps(re, d); + im = _mm_mul_ps(im, _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,3,0,1))); + return _mm_add_ps(re, im); +} + +__INLINE void +K_N(__m128 re, __m128 im, __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 uk, uk2, zk_p, zk_n, zk, zk_d; + + uk = *r0; + uk2 = *r1; + zk_p = IMUL(*r2, re, im); + zk_n = IMULJ(*r3, re, im); + + zk = _mm_add_ps(zk_p, zk_n); + zk_d = IMULI(_mm_sub_ps(zk_p, zk_n)); + + *r2 = _mm_sub_ps(uk, zk); + *r0 = _mm_add_ps(uk, zk); + *r3 = _mm_add_ps(uk2, zk_d); + *r1 = _mm_sub_ps(uk2, zk_d); +} + +__INLINE void TX2(__m128 *a, __m128 *b) { + __m128 TX2_t0 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(1,0,1,0)); + __m128 TX2_t1 = _mm_shuffle_ps(*a, *b, _MM_SHUFFLE(3,2,3,2)); + *a = TX2_t0; *b = TX2_t1; +} + +__INLINE void +LEAF_EE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + K_0(&r0,&r2,&r4,&r6); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7); + L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11); + L_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15); + K_0(&r0,&r4,&r8,&r12); + K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011,0.92387953251128673848313610506011),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0.38268343236508978177923268049199,-0.38268343236508978177923268049199),&r1,&r5,&r9,&r13); + TX2(&r0,&r1); + TX2(&r4,&r5); + TX2(&r8,&r9); + TX2(&r12,&r13); + S_4(r0,r4,r8,r12,out0+0,out0+8,out0+16,out0+24); + S_4(r1,r5,r9,r13,out1+0,out1+8,out1+16,out1+24); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2,&r6,&r10,&r14); + K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.38268343236508983729038391174981),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.92387953251128673848313610506011,-0.92387953251128673848313610506011),&r3,&r7,&r11,&r15); + TX2(&r2,&r3); + TX2(&r6,&r7); + TX2(&r10,&r11); + TX2(&r14,&r15); + S_4(r2,r6,r10,r14,out0+4,out0+12,out0+20,out0+28); + S_4(r3,r7,r11,r15,out1+4,out1+12,out1+20,out1+28); + *is += 16; +} + + +__INLINE void +LEAF_OO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + __m128 r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; + + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0,&r1,&r2,&r3); + L_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4,&r5,&r6,&r7); + K_0(&r0,&r2,&r4,&r6); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r1,&r3,&r5,&r7); + TX2(&r0,&r1); + TX2(&r2,&r3); + TX2(&r4,&r5); + TX2(&r6,&r7); + S_4(r0,r2,r4,r6,out0+0,out0+4,out0+8,out0+12); + S_4(r1,r3,r5,r7,out1+0,out1+4,out1+8,out1+12); + L_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8,&r9,&r10,&r11); + L_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12,&r13,&r14,&r15); + K_0(&r8,&r10,&r12,&r14); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r9,&r11,&r13,&r15); + TX2(&r8,&r9); + TX2(&r10,&r11); + TX2(&r12,&r13); + TX2(&r14,&r15); + S_4(r8,r10,r12,r14,out0+16,out0+20,out0+24,out0+28); + S_4(r9,r11,r13,r15,out1+16,out1+20,out1+24,out1+28); + + *is += 16; +} + +__INLINE void +L_4_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3, t4, t5, t6, t7; + t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); + t4 = _mm_add_ps(t0, t1); + t5 = _mm_sub_ps(t0, t1); + t6 = _mm_add_ps(t2, t3); + t7 = IMULI(_mm_sub_ps(t2, t3)); + t0 = _mm_add_ps(t4, t6); + t2 = _mm_sub_ps(t4, t6); + t1 = _mm_sub_ps(t5, t7); + t3 = _mm_add_ps(t5, t7); + TX2(&t0,&t1); + TX2(&t2,&t3); + *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; } + +__INLINE void +L_2_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3, t4, t5, t6, t7; + t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); t4 = _mm_add_ps(t0, t1); + t5 = _mm_sub_ps(t0, t1); + t6 = _mm_add_ps(t2, t3); + t7 = _mm_sub_ps(t2, t3); + TX2(&t4,&t5); + TX2(&t6,&t7); + *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7; +} + +__INLINE void +L_2_4(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3, t4, t5, t6, t7; + t0 = _mm_load_ps(i0); t1 = _mm_load_ps(i1); t2 = _mm_load_ps(i2); t3 = _mm_load_ps(i3); + t4 = _mm_add_ps(t0, t1); + t5 = _mm_sub_ps(t0, t1); + t6 = _mm_add_ps(t2, t3); + t7 = _mm_sub_ps(t2, t3); + *r0 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(1,0,1,0)); + *r1 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(1,0,1,0)); + t5 = IMULI(t5); + t0 = _mm_add_ps(t6, t4); + t2 = _mm_sub_ps(t6, t4); + t1 = _mm_sub_ps(t7, t5); + t3 = _mm_add_ps(t7, t5); + *r3 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,2,3,2)); + *r2 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,2,3,2)); +} + +__INLINE void +L_4_2(const data_t * restrict i0, const data_t * restrict i1, const data_t * restrict i2, const data_t * restrict i3, + __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) { + __m128 t0, t1, t2, t3, t4, t5, t6, t7; + t0 = _mm_load_ps(i0); + t1 = _mm_load_ps(i1); + t6 = _mm_load_ps(i2); + t7 = _mm_load_ps(i3); + t2 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,1,0)); + t3 = _mm_shuffle_ps(t7, t6, _MM_SHUFFLE(3,2,1,0)); + t4 = _mm_add_ps(t0, t1); + t5 = _mm_sub_ps(t0, t1); + t6 = _mm_add_ps(t2, t3); + t7 = _mm_sub_ps(t2, t3); + *r2 = _mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,2,3,2)); + *r3 = _mm_shuffle_ps(t6, t7, _MM_SHUFFLE(3,2,3,2)); + t7 = IMULI(t7); + t0 = _mm_add_ps(t4, t6); + t2 = _mm_sub_ps(t4, t6); + t1 = _mm_sub_ps(t5, t7); + t3 = _mm_add_ps(t5, t7); + *r0 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(1,0,1,0)); + *r1 = _mm_shuffle_ps(t2, t3, _MM_SHUFFLE(1,0,1,0)); +} + +__INLINE void +LEAF_OE(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r24_25,&r26_27); + L_2_4(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r30_31,&r28_29); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7); + S_4(r0_1,r2_3,r4_5,r6_7,out0+0,out0+4,out0+8,out0+12); + L_4_4(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r16_17,&r18_19); + L_2_2(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r20_21,&r22_23); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r8_9,&r10_11,&r12_13,&r14_15); + S_4(r8_9,r10_11,r12_13,r14_15,out0+16,out0+20,out0+24,out0+28); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23); + K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r16_17,&r20_21,&r24_25,&r28_29); + S_4(r16_17,r20_21,r24_25,r28_29,out1+0,out1+8,out1+16,out1+24); + K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r18_19,&r22_23,&r26_27,&r30_31); + S_4(r18_19,r22_23,r26_27,r30_31,out1+4,out1+12,out1+20,out1+28); + + *is += 16; +} + +__INLINE void +LEAF_EO(size_t ** restrict is, const data_t * restrict in, size_t ** restrict out_offsets, data_t * restrict out) { + __m128 r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15,r16_17,r18_19,r20_21,r22_23,r24_25,r26_27,r28_29,r30_31; + + data_t *out0 = out + (*out_offsets)[0]; + data_t *out1 = out + (*out_offsets)[1]; + *out_offsets += 2; + + L_4_4(in+(*is)[0],in+(*is)[1],in+(*is)[2],in+(*is)[3],&r0_1,&r2_3,&r16_17,&r18_19); + L_2_2(in+(*is)[4],in+(*is)[5],in+(*is)[6],in+(*is)[7],&r4_5,&r6_7,&r20_21,&r22_23); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r0_1,&r2_3,&r4_5,&r6_7); + L_4_2(in+(*is)[8],in+(*is)[9],in+(*is)[10],in+(*is)[11],&r8_9,&r10_11,&r28_29,&r30_31); + L_4_4(in+(*is)[12],in+(*is)[13],in+(*is)[14],in+(*is)[15],&r12_13,&r14_15,&r24_25,&r26_27); + K_N(_mm_set_ps(0.92387953251128673848313610506011,0.92387953251128673848313610506011,1,1),_mm_set_ps(0.38268343236508978177923268049199,-0.38268343236508978177923268049199,0,-0),&r0_1,&r4_5,&r8_9,&r12_13); + S_4(r0_1,r4_5,r8_9,r12_13,out0+0,out0+8,out0+16,out0+24); + K_N(_mm_set_ps(0.38268343236508983729038391174981,0.38268343236508983729038391174981,0.70710678118654757273731092936941,0.70710678118654757273731092936941),_mm_set_ps(0.92387953251128673848313610506011,-0.92387953251128673848313610506011,0.70710678118654746171500846685376,-0.70710678118654746171500846685376),&r2_3,&r6_7,&r10_11,&r14_15); + S_4(r2_3,r6_7,r10_11,r14_15,out0+4,out0+12,out0+20,out0+28); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r16_17,&r18_19,&r20_21,&r22_23); + S_4(r16_17,r18_19,r20_21,r22_23,out1+0,out1+4,out1+8,out1+12); + K_N(_mm_set_ps(0.70710678118654757273731092936941,0.70710678118654757273731092936941,1,1),_mm_set_ps(0.70710678118654746171500846685376,-0.70710678118654746171500846685376,0,-0),&r24_25,&r26_27,&r28_29,&r30_31); + S_4(r24_25,r26_27,r28_29,r30_31,out1+16,out1+20,out1+24,out1+28); + + *is += 16; +} + + +#endif diff --git a/src/patterns.c b/src/patterns.c new file mode 100644 index 0000000..1ab593f --- /dev/null +++ b/src/patterns.c @@ -0,0 +1,160 @@ +#include "patterns.h" + +void permute_addr(int N, int offset, int stride, int *d) { + int i, a[4] = {0,2,1,3}; + for(i=0;i<4;i++) { + d[i] = offset + (a[i] << stride); + if(d[i] < 0) d[i] += N; + } +} + +void hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) { + + if(N > 4) { + hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL); + if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL); + if(N/4 >= 4) hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL); + else { + int temp = poffset+(1<<stride); + if(temp < 0) temp += bigN; + temp *= 2; + + if(!(temp % (VL*2))) { + (*is)[0] = poffset+(1<<stride); + (*is)[1] = poffset+(1<<stride)+(1<<(stride+2)); + (*is)[2] = poffset-(1<<stride); + (*is)[3] = poffset-(1<<stride)+(1<<(stride+2)); + int i; + for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN; + for(i=0;i<4;i++) (*is)[i] *= 2; + *is += 4; + } + } + }else if(N == 4) { + int perm[4]; + permute_addr(bigN, poffset, stride, perm); + if(!((perm[0]*2) % (VL*2))) { + int i; + for(i=0;i<4;i++) { + (*is)[i] = perm[i] * 2; + } + *is += 4; + } + } +} + +void init_is(ffts_plan_t *p, int N, int leafN, int VL) { + int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3; + int stride = log(N/leafN)/log(2); + + p->is = malloc(N/VL * sizeof(ptrdiff_t)); + + ptrdiff_t *is = p->is; + + if((N/leafN) % 3 > 1) i1++; + + for(i=0;i<i0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL); + for(i=i0;i<i0+i1;i++) { + hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL); + hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL); + } + for(i=0-i2;i<0;i++) hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL); + + +//for(i=0;i<N/VL;i++) { +// printf("%td ", p->is[i]); +// if(i % 16 == 15) printf("\n"); +//} + + p->i0 = i0; p->i1 = i1; p->i2 = i2; +} + +void elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) { + if((even && N == leafN) || (!even && N <= leafN)) { + offsets[2*(ooffset/leafN)] = ioffset*2; + offsets[2*(ooffset/leafN)+1] = ooffset; + }else if(N > 4) { + elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even); + elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0); + if(N/4 >= leafN) + elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0); + } + +} + +int compare_offsets(const void *a, const void *b) { + return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0]; +} + +void init_offsets(ffts_plan_t *p, int N, int leafN) { + + ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t)); + + elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1); + + size_t i; + for(i=0;i<2*N/leafN;i+=2) { + if(offsets[i] < 0) offsets[i] = N + offsets[i]; + } + + qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets); + //elaborate_is(p, N, 0, 0, 1); + p->offsets = malloc(N/leafN * sizeof(ptrdiff_t)); + for(i=0;i<N/leafN;i++) { + p->offsets[i] = offsets[i*2+1]*2; + } +//for(i=0;i<N/leafN;i++) { +// printf("%td\n", p->offsets[i]); +//} + free(offsets); + + +} + + +int tree_count(int N, int leafN, int offset) { + + if(N <= leafN) return 0; + int count = 0; + count += tree_count(N/4, leafN, offset); + count += tree_count(N/8, leafN, offset + N/4); + count += tree_count(N/8, leafN, offset + N/4 + N/8); + count += tree_count(N/4, leafN, offset + N/2); + count += tree_count(N/4, leafN, offset + 3*N/4); + + return 1 + count; +} + +void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) { + + if(N <= leafN) return; + elaborate_tree(p, N/4, leafN, offset); + elaborate_tree(p, N/8, leafN, offset + N/4); + elaborate_tree(p, N/8, leafN, offset + N/4 + N/8); + elaborate_tree(p, N/4, leafN, offset + N/2); + elaborate_tree(p, N/4, leafN, offset + 3*N/4); + + (*p)[0] = N; + (*p)[1] = offset*2; + + (*p)+=2; +} + +void init_tree(ffts_plan_t *p, int N, int leafN) { + + int count = tree_count(N, leafN, 0) + 1; + transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t)); + +//printf("count = %d\n", count); + + elaborate_tree(&ps, N, leafN, 0); + ps[0] = 0; + ps[1] = 0; +//int i; +//for(i=0;i<count;i++) { +// printf("%d %d - %d\n", p->transforms[i*2], p->transforms[i*2+1], +// __builtin_ctzl(p->transforms[i*2]) - 5); +//} + +} + diff --git a/src/patterns.h b/src/patterns.h new file mode 100644 index 0000000..298ca3e --- /dev/null +++ b/src/patterns.h @@ -0,0 +1,10 @@ +#ifndef __PATTERNS_H__ +#define __PATTERNS_H__ + +#include "cp_sse.h" + +void init_is(ffts_plan_t *p, int N, int leafN, int VL); +void init_offsets(ffts_plan_t *p, int N, int leafN); +void init_tree(ffts_plan_t *p, int N, int leafN); + +#endif |