diff options
author | Anthony Blake <anthonix@me.com> | 2012-08-29 18:08:30 +1200 |
---|---|---|
committer | Anthony Blake <anthonix@me.com> | 2012-08-29 18:08:30 +1200 |
commit | 5e4a32240e9ed9cb03ef51d2344ec80c615489cb (patch) | |
tree | ae0fea8ad5906bad0a2393868eeb10b0019815f9 /src | |
parent | 625f46968820cb98391d67782a9deac4504e289a (diff) | |
download | ffts-5e4a32240e9ed9cb03ef51d2344ec80c615489cb.zip ffts-5e4a32240e9ed9cb03ef51d2344ec80c615489cb.tar.gz |
SSE LEAF EE works
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 24 | ||||
-rw-r--r-- | src/Makefile.in | 309 | ||||
-rw-r--r-- | src/codegen.c | 150 | ||||
-rw-r--r-- | src/codegen.h | 2 | ||||
-rw-r--r-- | src/codegen_neon.h | 68 | ||||
-rw-r--r-- | src/codegen_sse.h | 104 | ||||
-rw-r--r-- | src/cp_sse.c | 3 | ||||
-rw-r--r-- | src/neon.h | 17 | ||||
-rw-r--r-- | src/patterns.c | 6 | ||||
-rw-r--r-- | src/sse.s | 174 |
10 files changed, 720 insertions, 137 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 84265d6..0b0791e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,20 +1,12 @@ -OBJLIBS = libffts.a -HDRS = cp_sse.h patterns.h macros.h neon_float.h codegen.h neon.h -FILES = cp_sse.c patterns.c codegen.c neon.s -OBJS = cp_sse.o patterns.o codegen.o neon.o -all: $(OBJLIBS) +lib_LTLIBRARIES = libffts.la -%.o: %.c $(HDRS) - $(CC) $(CFLAGS) -c -o $@ $< -I../include -g - $(CC) $(CFLAGS) -S $< -I../include +libffts_la_SOURCES = cp_sse.c patterns.c codegen.c -%.o: %.s - $(CC) $(CFLAGS) -c -o $@ $< -I../include -g - -$(OBJLIBS): $(OBJS) - $(AR) rcs libffts.a $(OBJS) - -clean: - $(RM) -f *.o $(OBJLIBS) +if HAVE_NEON +libffts_la_SOURCES += neon.s +endif +if HAVE_SSE +libffts_la_SOURCES += sse.s +endif diff --git a/src/Makefile.in b/src/Makefile.in index 52c6551..3d87aa0 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -13,6 +13,7 @@ # PARTICULAR PURPOSE. @SET_MAKE@ + VPATH = @srcdir@ am__make_dryrun = \ { \ @@ -47,8 +48,13 @@ POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +@HAVE_NEON_TRUE@am__append_1 = neon.s +@HAVE_SSE_TRUE@am__append_2 = sse.s subdir = src -DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \ + $(top_srcdir)/depcomp ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ @@ -57,49 +63,123 @@ mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = -SOURCES = -DIST_SOURCES = +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__installdirs = "$(DESTDIR)$(libdir)" +LTLIBRARIES = $(lib_LTLIBRARIES) +libffts_la_LIBADD = +am__libffts_la_SOURCES_DIST = cp_sse.c patterns.c codegen.c neon.s \ + sse.s +@HAVE_NEON_TRUE@am__objects_1 = neon.lo +@HAVE_SSE_TRUE@am__objects_2 = sse.lo +am_libffts_la_OBJECTS = cp_sse.lo patterns.lo codegen.lo \ + $(am__objects_1) $(am__objects_2) +libffts_la_OBJECTS = $(am_libffts_la_OBJECTS) +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ + $(LDFLAGS) -o $@ +CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) +LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ + --mode=compile $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) +SOURCES = $(libffts_la_SOURCES) +DIST_SOURCES = $(am__libffts_la_SOURCES_DIST) am__can_run_installinfo = \ case $$AM_UPDATE_INFO_DIR in \ n|no|NO) false;; \ *) (install-info --version) >/dev/null 2>&1;; \ esac +ETAGS = etags +CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ +AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ +CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ +FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ @@ -108,6 +188,8 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ @@ -116,22 +198,32 @@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ +build = @build@ build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ +host = @host@ host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ @@ -155,13 +247,13 @@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ -OBJLIBS = libffts.a -HDRS = cp_sse.h patterns.h macros.h neon_float.h codegen.h neon.h -FILES = cp_sse.c patterns.c codegen.c neon.s -OBJS = cp_sse.o patterns.o codegen.o neon.o +lib_LTLIBRARIES = libffts.la +libffts_la_SOURCES = cp_sse.c patterns.c codegen.c $(am__append_1) \ + $(am__append_2) all: all-am .SUFFIXES: +.SUFFIXES: .c .lo .o .obj .s $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ @@ -192,14 +284,154 @@ $(top_srcdir)/configure: $(am__configure_deps) $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): +install-libLTLIBRARIES: $(lib_LTLIBRARIES) + @$(NORMAL_INSTALL) + @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ + list2=; for p in $$list; do \ + if test -f $$p; then \ + list2="$$list2 $$p"; \ + else :; fi; \ + done; \ + test -z "$$list2" || { \ + echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \ + $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ + } + +uninstall-libLTLIBRARIES: + @$(NORMAL_UNINSTALL) + @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ + for p in $$list; do \ + $(am__strip_dir) \ + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ + done + +clean-libLTLIBRARIES: + -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) + @list='$(lib_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } +libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cp_sse.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c $< + +.c.obj: +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< +@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $< + +.s.o: + $(CCASCOMPILE) -c -o $@ $< + +.s.obj: + $(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.s.lo: + $(LTCCASCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + mkid -fID $$unique tags: TAGS -TAGS: +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + set x; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi ctags: CTAGS -CTAGS: - -cscope cscopelist: +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in files) print i; }; }'`; \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" + +cscopelist: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS) $(LISP)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ @@ -233,8 +465,11 @@ distdir: $(DISTFILES) done check-am: all-am check: check-am -all-am: Makefile +all-am: Makefile $(LTLIBRARIES) installdirs: + for dir in "$(DESTDIR)$(libdir)"; do \ + test -z "$$dir" || $(MKDIR_P) "$$dir"; \ + done install: install-am install-exec: install-exec-am install-data: install-data-am @@ -265,11 +500,16 @@ distclean-generic: maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." -clean-am: clean-generic mostlyclean-am +clean: clean-am + +clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ + mostlyclean-am distclean: distclean-am + -rm -rf ./$(DEPDIR) -rm -f Makefile -distclean-am: clean-am distclean-generic +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags dvi: dvi-am @@ -289,7 +529,7 @@ install-dvi: install-dvi-am install-dvi-am: -install-exec-am: +install-exec-am: install-libLTLIBRARIES install-html: install-html-am @@ -312,12 +552,14 @@ install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am -mostlyclean-am: mostlyclean-generic +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool pdf: pdf-am @@ -327,35 +569,24 @@ ps: ps-am ps-am: -uninstall-am: +uninstall-am: uninstall-libLTLIBRARIES .MAKE: install-am install-strip -.PHONY: all all-am check check-am clean clean-generic distclean \ - distclean-generic distdir dvi dvi-am html html-am info info-am \ +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ + clean-libLTLIBRARIES clean-libtool cscopelist ctags distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ install install-am install-data install-data-am install-dvi \ install-dvi-am install-exec install-exec-am install-html \ - install-html-am install-info install-info-am install-man \ - install-pdf install-pdf-am install-ps install-ps-am \ - install-strip installcheck installcheck-am installdirs \ - maintainer-clean maintainer-clean-generic mostlyclean \ - mostlyclean-generic pdf pdf-am ps ps-am uninstall uninstall-am - - -all: $(OBJLIBS) - -%.o: %.c $(HDRS) - $(CC) $(CFLAGS) -c -o $@ $< -I../include -g - $(CC) $(CFLAGS) -S $< -I../include - -%.o: %.s - $(CC) $(CFLAGS) -c -o $@ $< -I../include -g - -$(OBJLIBS): $(OBJS) - $(AR) rcs libffts.a $(OBJS) + install-html-am install-info install-info-am \ + install-libLTLIBRARIES install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-libLTLIBRARIES -clean: - $(RM) -f *.o $(OBJLIBS) # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/src/codegen.c b/src/codegen.c index 681414f..68d3b3d 100644 --- a/src/codegen.c +++ b/src/codegen.c @@ -1,10 +1,19 @@ #include "codegen.h" #include "macros.h" -#include "neon_float.h" //#include "mini_macros.h" -#include "neon.h" #include "cp_sse.h" #include <libkern/OSCacheControl.h> +#include <sys/types.h> +#include <sys/mman.h> + +#ifdef __ARM_NEON__ + #include "codegen_neon.h" + #include "neon_float.h" + #include "neon.h" +#else + #include "codegen_sse.h" + #include "sse_float.h" +#endif int tree_count(int N, int leafN, int offset) { @@ -35,69 +44,7 @@ void elaborate_tree(size_t **p, int N, int leafN, int offset) { } -void neon_x4_x(data_t * restrict data, const data_t * restrict LUT) { - X_8_SPLIT(data, 64, LUT); -} - - -uint32_t BL(void *pos, void *target) { - return 0xeb000000 | (((target - pos) / 4) & 0xffffff); -} - -uint32_t B(uint8_t r) { - return 0xe12fff10 | r; -} - -uint32_t MOV(uint8_t dst, uint8_t src) { - return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12); -} - -void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) { - int32_t oimm = imm; - if(imm < 0) { - imm = -imm; - uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); - if(shamt & 1) shamt -= 1; - imm >>= shamt; - shamt = (32 - shamt)/2; - - // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); - *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); - - if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); - - }else{ - uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); - if(shamt & 1) shamt -= 1; - imm >>= shamt; - shamt = (32 - shamt)/2; - -// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); - - *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); - - if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2)))); - } -} - -uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) { - return 0xe5900000 | ((dst & 0xf) << 12) - | ((base & 0xf) << 16) | (offset & 0xfff) ; -} -uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) { - uint32_t oimm = imm; - - uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); - if(shamt & 1) shamt -= 1; - imm >>= shamt; - shamt = (32 - shamt)/2; - *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ; - if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2)))); -} - -uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; } -uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; } uint32_t LUT_offset(size_t N, size_t leafN) { int i; @@ -130,7 +77,13 @@ uint32_t LUT_offset(size_t N, size_t leafN) { return lut_size; } -transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { +#ifdef __ARM_NEON__ + typedef uint32_t insns_t; +#else + typedef uint8_t insns_t; +#endif + +void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) { int count = tree_count(N, leafN, 0) + 1; size_t *ps = malloc(count * 2 * sizeof(size_t)); size_t *pps = ps; @@ -144,29 +97,37 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) if(N < 8192) p->transform_size = 8192; else p->transform_size = N; - p->transform_base = valloc(p->transform_size);//(void *)func; - uint32_t *func = p->transform_base;//valloc(8192); - uint32_t *fp = func; + p->transform_base = valloc(p->transform_size);//mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); + /* + if(p->transform_base == MAP_FAILED) { + fprintf(stderr, "MAP FAILED\n"); + exit(1); + }*/ + + insns_t *func = p->transform_base;//valloc(8192); + insns_t *fp = func; fprintf(stderr, "Allocating %d bytes \n", p->transform_size); + fprintf(stderr, "Base address = %016p\n", func); if(!func) { fprintf(stderr, "NOMEM\n"); exit(1); } - uint32_t *x_8_addr = fp; + insns_t *x_8_addr = fp; memcpy(fp, neon_x8, neon_x8_t - neon_x8); fp += (neon_x8_t - neon_x8) / 4; //uint32_t *x_8_t_addr = fp; //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); //fp += (neon_end - neon_x8_t) / 4; - uint32_t *x_4_addr = fp; + insns_t *x_4_addr = fp; memcpy(fp, neon_x4, neon_x8 - neon_x4); fp += (neon_x8 - neon_x4) / 4; - uint32_t *start = fp; + insns_t *start = fp; +#ifdef __ARM_NEON__ *fp++ = PUSH_LR(); ADDI(&fp, 3, 1, 0); @@ -183,20 +144,40 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) ADDI(&fp, 1, 0, 0); ADDI(&fp, 0, 2, 0), // mov out into r0 +#endif p->oe_ws = oe_w_data; p->ee_ws = ee_w_data; p->eo_ws = eo_w_data; - +#ifdef __ARM_NEON__ *fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); MOVI(&fp, 11, p->i0); +#else + *fp++ = 0x4c; + *fp++ = 0x8b; + *fp++ = 0x07; + MOVI(&fp, RCX, p->i0 * 4); + //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); +#endif //fp++; - memcpy(fp, neon_ee, neon_oo - neon_ee); - fp += (neon_oo - neon_ee) / 4; + memcpy(fp, leaf_ee, neon_oo - leaf_ee); +#ifdef __ARM_NEON__ + fp += (neon_oo - leaf_ee) / 4; +#else + int i; + + IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)leaf_ee - (void *)fp )); + + uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; + for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); + + fp += (neon_oo - leaf_ee); +#endif +#ifdef __ARM_NEON__ if(__builtin_ctzl(N) & 1){ ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); @@ -316,6 +297,23 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) } *fp++ = POP_LR(); count++; +#else + RET(&fp); + + + uint8_t *pp = func; + int counter = 0; + do{ + printf("%02x ", *pp); + if(counter++ % 16 == 15) printf("\n"); + } while(++pp < fp); + + printf("\n"); + + +#endif + + // *fp++ = B(14); count++; //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) @@ -327,7 +325,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { perror("Couldn't mprotect"); - return NULL; + exit(1); } sys_icache_invalidate(func, p->transform_size); @@ -335,5 +333,5 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); - return (transform_func_t)start; + p->transform = start; } diff --git a/src/codegen.h b/src/codegen.h index f887ca0..a41735b 100644 --- a/src/codegen.h +++ b/src/codegen.h @@ -13,7 +13,7 @@ typedef struct _ffts_plan_t ffts_plan_t; typedef void (*transform_func_t)(float *data, size_t N, float *LUT); -transform_func_t ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN); +void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN); //static const __attribute__ ((aligned(16))) float ee_w_data[4] = //{0.70710678118654757273731092936941, diff --git a/src/codegen_neon.h b/src/codegen_neon.h new file mode 100644 index 0000000..b14cb1c --- /dev/null +++ b/src/codegen_neon.h @@ -0,0 +1,68 @@ +#ifndef __CODEGEN_NEON_H__ +#define __CODEGEN_NEON_H__ + + + +uint32_t BL(void *pos, void *target) { + return 0xeb000000 | (((target - pos) / 4) & 0xffffff); +} + +uint32_t B(uint8_t r) { + return 0xe12fff10 | r; +} + +uint32_t MOV(uint8_t dst, uint8_t src) { + return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12); +} + +void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) { + int32_t oimm = imm; + if(imm < 0) { + imm = -imm; + uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + + // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2)))); + + }else{ + uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + +// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm); + + *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff); + + if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2)))); + } +} + +uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) { + return 0xe5900000 | ((dst & 0xf) << 12) + | ((base & 0xf) << 16) | (offset & 0xfff) ; +} + +uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) { + uint32_t oimm = imm; + + uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm); + if(shamt & 1) shamt -= 1; + imm >>= shamt; + shamt = (32 - shamt)/2; + *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ; + if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2)))); +} + +uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; } +uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; } + + + + +#endif diff --git a/src/codegen_sse.h b/src/codegen_sse.h new file mode 100644 index 0000000..d3b136a --- /dev/null +++ b/src/codegen_sse.h @@ -0,0 +1,104 @@ +#ifndef __CODEGEN_SSE_H__ +#define __CODEGEN_SSE_H__ + +static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376, + -0.70710678118654757273731092936941,-0.70710678118654746171500846685376}; +static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; +static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; +void neon_x4(float *, size_t, float *); +void neon_x8(float *, size_t, float *); +void neon_x8_t(float *, size_t, float *); +void leaf_ee(); +void neon_oo(); +void neon_eo(); +void neon_oe(); +void neon_end(); + + +extern const uint32_t sse_leaf_ee_offsets[8]; + +#define EAX 0 +#define ECX 1 +#define EDX 2 +#define EBX 3 +#define ESI 6 +#define EDI 7 +#define EBP 5 + +#define RAX 0 +#define RCX 1 +#define RDX 2 +#define RBX 3 +#define RSI 6 +#define RDI 7 +#define RBP 5 +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R13 13 +#define R14 14 +#define R15 15 + +void IMM8(uint8_t **p, uint32_t imm) { + *(*p)++ = (imm & 0xff); +} + +void IMM32(uint8_t **p, uint32_t imm) { + int i; + for(i=0;i<4;i++) { + *(*p)++ = (imm & (0xff << (i*8))) >> (i*8); + } +} +void IMM32_NI(uint8_t *p, uint32_t imm) { + int i; + for(i=0;i<4;i++) { + *(p+i) = (imm & (0xff << (i*8))) >> (i*8); + } +} + +uint32_t READ_IMM32(uint8_t *p) { + uint32_t rval = 0; + int i; + for(i=0;i<4;i++) { + rval |= *(p+i) << (i*8); + } + return rval; +} + +void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) { + if(dst < 8) { + *(*p)++ = 0xb8 + dst; + }else{ + *(*p)++ = 0x49; + *(*p)++ = 0xc7; + *(*p)++ = 0xc0 | (dst - 8); + } + IMM32(p, imm); +} + +void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) { + if(disp == 0) { + *(*p)++ = (rm & 7) | ((reg & 7) << 3); + }else if(disp <= 127 || disp >= -128) { + *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3); + IMM8(p, disp); + }else{ + *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3); + IMM32(p, disp); + } +} + +void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) { + + *(*p)++ = 0x48 | ((base & 0x8) >> 3) | ((dst & 0x8) >> 1); + *(*p)++ = 0x8d; + ADDRMODE(p, dst, base, disp); +} + +void RET(uint8_t **p) { + *(*p)++ = 0xc3; +} + +#endif diff --git a/src/cp_sse.c b/src/cp_sse.c index 088798b..f9a5c8f 100644 --- a/src/cp_sse.c +++ b/src/cp_sse.c @@ -308,8 +308,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) { p->N = N; p->lastlut = w; p->n_luts = n_luts; - if(N>=32) - p->transform = ffts_generate_func_code(p, N, leafN); + if(N>=32) ffts_generate_func_code(p, N, leafN); // fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t)); return p; diff --git a/src/neon.h b/src/neon.h new file mode 100644 index 0000000..16cb38d --- /dev/null +++ b/src/neon.h @@ -0,0 +1,17 @@ +#ifndef __NEON_H__ +#define __NEON_H__ + +static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376, + -0.70710678118654757273731092936941,-0.70710678118654746171500846685376}; +static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; +static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376}; +void neon_x4(float *, size_t, float *); +void neon_x8(float *, size_t, float *); +void neon_x8_t(float *, size_t, float *); +void neon_ee(); +void neon_oo(); +void neon_eo(); +void neon_oe(); +void neon_end(); + +#endif diff --git a/src/patterns.c b/src/patterns.c index 664f20e..29fa5ae 100644 --- a/src/patterns.c +++ b/src/patterns.c @@ -114,9 +114,9 @@ void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) { for(i=0;i<N/leafN;i++) { p->offsets[i] = offsets[i*2+1]*2; } -//for(i=0;i<N/leafN;i++) { -// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); -//} + for(i=0;i<N/leafN;i++) { + printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N))); + } free(offsets); diff --git a/src/sse.s b/src/sse.s new file mode 100644 index 0000000..c8e509f --- /dev/null +++ b/src/sse.s @@ -0,0 +1,174 @@ + + .globl _neon_x4 + .align 4 +_neon_x4: + + .globl _neon_x8 + .align 4 +_neon_x8: + + .globl _neon_x8_t + .align 4 +_neon_x8_t: + + + +# eax is loop counter (init to 0) +# rcx is loop max count +# rsi is 'in' base pointer +# rdx is 'out' base pointer +# r8 is offsets pointer +# r9 is constants pointer +# scratch: rax r11 r12 + .globl _leaf_ee + .align 4, 0x90 +_leaf_ee: + lea L_sse_constants(%rip), %r9 + movaps 32(%r9), %xmm0 #83.5 + movaps 0x0(%r9), %xmm8 #83.5 + xorl %eax, %eax + .align 4, 0x90 +LEAF_EE_1: +LEAF_EE_const_0: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm7 #83.5 +LEAF_EE_const_2: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm12 #83.5 + movaps %xmm7, %xmm6 #83.5 +LEAF_EE_const_3: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm10 #83.5 + movaps %xmm12, %xmm11 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm10, %xmm11 #83.5 + xorps %xmm8, %xmm12 #83.5 +LEAF_EE_const_1: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm9 #83.5 +LEAF_EE_const_4: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm10 #83.5 + addps %xmm9, %xmm6 #83.5 + subps %xmm9, %xmm7 #83.5 +LEAF_EE_const_5: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm13 #83.5 + movaps %xmm10, %xmm9 #83.5 +LEAF_EE_const_6: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm3 #83.5 + movaps %xmm6, %xmm5 #83.5 +LEAF_EE_const_7: + movaps 0xBEBAFECA(%rsi,%rax,4), %xmm14 #83.5 + movaps %xmm3, %xmm15 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm7, %xmm4 #83.5 + movslq (%r8, %rax, 8), %r11 #83.44 + subps %xmm13, %xmm10 #83.5 + subps %xmm14, %xmm3 #83.5 + addps %xmm11, %xmm5 #83.5 + subps %xmm11, %xmm6 #83.5 + subps %xmm12, %xmm4 #83.5 + addps %xmm12, %xmm7 #83.5 + addps %xmm13, %xmm9 #83.5 + addps %xmm14, %xmm15 #83.5 + movaps 16(%r9), %xmm12 #83.5 + movaps %xmm9, %xmm1 #83.5 + movaps 16(%r9), %xmm11 #83.5 + movaps %xmm5, %xmm2 #83.5 + mulps %xmm10, %xmm12 #83.5 + subps %xmm15, %xmm9 #83.5 + addps %xmm15, %xmm1 #83.5 + mulps %xmm3, %xmm11 #83.5 + addps %xmm1, %xmm2 #83.5 + subps %xmm1, %xmm5 #83.5 + shufps $177, %xmm10, %xmm10 #83.5 + xorps %xmm8, %xmm9 #83.5 + shufps $177, %xmm3, %xmm3 #83.5 + movaps %xmm6, %xmm1 #83.5 + mulps %xmm0, %xmm10 #83.5 + movaps %xmm4, %xmm13 #83.5 + mulps %xmm0, %xmm3 #83.5 + subps %xmm10, %xmm12 #83.5 + addps %xmm3, %xmm11 #83.5 + movaps %xmm12, %xmm3 #83.5 + movaps %xmm7, %xmm14 #83.5 + shufps $177, %xmm9, %xmm9 #83.5 + subps %xmm11, %xmm12 #83.5 + addps %xmm11, %xmm3 #83.5 + subps %xmm9, %xmm1 #83.5 + addps %xmm9, %xmm6 #83.5 + addps %xmm3, %xmm4 #83.5 + subps %xmm3, %xmm13 #83.5 + xorps %xmm8, %xmm12 #83.5 + movaps %xmm2, %xmm3 #83.5 + shufps $177, %xmm12, %xmm12 #83.5 + movaps %xmm6, %xmm9 #83.5 + movslq 4(%r8, %rax, 8), %r12 #83.59 + movlhps %xmm4, %xmm3 #83.5 + addq $4, %rax + shufps $238, %xmm4, %xmm2 #83.5 + movaps %xmm1, %xmm4 #83.5 + movaps %xmm3, (%rdx,%r11,4) #83.5 + subps %xmm12, %xmm7 #83.5 + addps %xmm12, %xmm14 #83.5 + movlhps %xmm7, %xmm4 #83.5 + shufps $238, %xmm7, %xmm1 #83.5 + movaps %xmm5, %xmm7 #83.5 + movlhps %xmm13, %xmm7 #83.5 + movlhps %xmm14, %xmm9 #83.5 + shufps $238, %xmm13, %xmm5 #83.5 + shufps $238, %xmm14, %xmm6 #83.5 + movaps %xmm4, 16(%rdx,%r11,4) #83.5 + movaps %xmm7, 32(%rdx,%r11,4) #83.5 + movaps %xmm9, 48(%rdx,%r11,4) #83.5 + movaps %xmm2, (%rdx,%r12,4) #83.5 + movaps %xmm1, 16(%rdx,%r12,4) #83.5 + movaps %xmm5, 32(%rdx,%r12,4) #83.5 + movaps %xmm6, 48(%rdx,%r12,4) #83.5 + cmpq %rcx, %rax + jne LEAF_EE_1 + + .globl _neon_oo + .align 4 +_neon_oo: + + + .globl _neon_eo + .align 4 +_neon_eo: + + + .globl _neon_oe + .align 4 +_neon_oe: + + + .globl _neon_end + .align 4 +_neon_end: + + + .globl _sse_leaf_ee_offsets + .align 4 +_sse_leaf_ee_offsets: + .long LEAF_EE_const_0-_leaf_ee+0x4 + .long LEAF_EE_const_1-_leaf_ee+0x5 + .long LEAF_EE_const_2-_leaf_ee+0x5 + .long LEAF_EE_const_3-_leaf_ee+0x5 + .long LEAF_EE_const_4-_leaf_ee+0x5 + .long LEAF_EE_const_5-_leaf_ee+0x5 + .long LEAF_EE_const_6-_leaf_ee+0x4 + .long LEAF_EE_const_7-_leaf_ee+0x5 + + .section __TEXT, __const + .align 4 +L_sse_constants: +L_2il0floatpacket.719: + .long 0x00000000,0x80000000,0x00000000,0x80000000 + .align 4 +L_2il0floatpacket.720: + .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3 + .align 4 +L_2il0floatpacket.721: + .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3 + .align 4 +L_2il0floatpacket.722: + .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3 + .align 4 +L_2il0floatpacket.723: + .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3 |