summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Anthony Blake <anthonix@me.com> 2012-08-29 18:08:30 +1200
committer: Anthony Blake <anthonix@me.com> 2012-08-29 18:08:30 +1200
commit: 5e4a32240e9ed9cb03ef51d2344ec80c615489cb (patch)
tree: ae0fea8ad5906bad0a2393868eeb10b0019815f9 /src
parent: 625f46968820cb98391d67782a9deac4504e289a (diff)
download: ffts-5e4a32240e9ed9cb03ef51d2344ec80c615489cb.zip
ffts-5e4a32240e9ed9cb03ef51d2344ec80c615489cb.tar.gz
SSE LEAF EE works
Diffstat (limited to 'src')
-rw-r--r-- src/Makefile.am 24
-rw-r--r-- src/Makefile.in 309
-rw-r--r-- src/codegen.c 150
-rw-r--r-- src/codegen.h 2
-rw-r--r-- src/codegen_neon.h 68
-rw-r--r-- src/codegen_sse.h 104
-rw-r--r-- src/cp_sse.c 3
-rw-r--r-- src/neon.h 17
-rw-r--r-- src/patterns.c 6
-rw-r--r-- src/sse.s 174
10 files changed, 720 insertions, 137 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 84265d6..0b0791e 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,20 +1,12 @@
-OBJLIBS = libffts.a
-HDRS = cp_sse.h patterns.h macros.h neon_float.h codegen.h neon.h
-FILES = cp_sse.c patterns.c codegen.c neon.s
-OBJS = cp_sse.o patterns.o codegen.o neon.o
-all: $(OBJLIBS)
+lib_LTLIBRARIES = libffts.la
-%.o: %.c $(HDRS)
- $(CC) $(CFLAGS) -c -o $@ $< -I../include -g
- $(CC) $(CFLAGS) -S $< -I../include
+libffts_la_SOURCES = cp_sse.c patterns.c codegen.c
-%.o: %.s
- $(CC) $(CFLAGS) -c -o $@ $< -I../include -g
-
-$(OBJLIBS): $(OBJS)
- $(AR) rcs libffts.a $(OBJS)
-
-clean:
- $(RM) -f *.o $(OBJLIBS)
+if HAVE_NEON
+libffts_la_SOURCES += neon.s
+endif
+if HAVE_SSE
+libffts_la_SOURCES += sse.s
+endif
diff --git a/src/Makefile.in b/src/Makefile.in
index 52c6551..3d87aa0 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -13,6 +13,7 @@
# PARTICULAR PURPOSE.
@SET_MAKE@
+
VPATH = @srcdir@
am__make_dryrun = \
{ \
@@ -47,8 +48,13 @@ POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+@HAVE_NEON_TRUE@am__append_1 = neon.s
+@HAVE_SSE_TRUE@am__append_2 = sse.s
subdir = src
-DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
+DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+ $(top_srcdir)/depcomp
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
@@ -57,49 +63,123 @@ mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
-SOURCES =
-DIST_SOURCES =
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+ *) f=$$p;; \
+ esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+ for p in $$list; do echo "$$p $$p"; done | \
+ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+ if (++n[$$2] == $(am__install_max)) \
+ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+ END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+ test -z "$$files" \
+ || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+ || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+ $(am__cd) "$$dir" && rm -f $$files; }; \
+ }
+am__installdirs = "$(DESTDIR)$(libdir)"
+LTLIBRARIES = $(lib_LTLIBRARIES)
+libffts_la_LIBADD =
+am__libffts_la_SOURCES_DIST = cp_sse.c patterns.c codegen.c neon.s \
+ sse.s
+@HAVE_NEON_TRUE@am__objects_1 = neon.lo
+@HAVE_SSE_TRUE@am__objects_2 = sse.lo
+am_libffts_la_OBJECTS = cp_sse.lo patterns.lo codegen.lo \
+ $(am__objects_1) $(am__objects_2)
+libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp = $(SHELL) $(top_srcdir)/depcomp
+am__depfiles_maybe = depfiles
+am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+ $(LDFLAGS) -o $@
+CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
+ --mode=compile $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+SOURCES = $(libffts_la_SOURCES)
+DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
+ETAGS = etags
+CTAGS = ctags
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
AMTAR = @AMTAR@
+AR = @AR@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AWK = @AWK@
CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CXX = @CXX@
+CXXCPP = @CXXCPP@
CXXDEPMODE = @CXXDEPMODE@
CXXFLAGS = @CXXFLAGS@
CYGPATH_W = @CYGPATH_W@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
+FGREP = @FGREP@
GREP = @GREP@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
@@ -108,6 +188,8 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+SED = @SED@
SET_MAKE = @SET_MAKE@
SHELL = @SHELL@
STRIP = @STRIP@
@@ -116,22 +198,32 @@ abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
+build = @build@
build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
+host = @host@
host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
@@ -155,13 +247,13 @@ target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
-OBJLIBS = libffts.a
-HDRS = cp_sse.h patterns.h macros.h neon_float.h codegen.h neon.h
-FILES = cp_sse.c patterns.c codegen.c neon.s
-OBJS = cp_sse.o patterns.o codegen.o neon.o
+lib_LTLIBRARIES = libffts.la
+libffts_la_SOURCES = cp_sse.c patterns.c codegen.c $(am__append_1) \
+ $(am__append_2)
all: all-am
.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj .s
$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
@@ -192,14 +284,154 @@ $(top_srcdir)/configure: $(am__configure_deps)
$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+ list2=; for p in $$list; do \
+ if test -f $$p; then \
+ list2="$$list2 $$p"; \
+ else :; fi; \
+ done; \
+ test -z "$$list2" || { \
+ echo " $(MKDIR_P) '$(DESTDIR)$(libdir)'"; \
+ $(MKDIR_P) "$(DESTDIR)$(libdir)" || exit 1; \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \
+ }
+
+uninstall-libLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
+ for p in $$list; do \
+ $(am__strip_dir) \
+ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \
+ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \
+ done
+
+clean-libLTLIBRARIES:
+ -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+ @list='$(lib_LTLIBRARIES)'; \
+ locs=`for p in $$list; do echo $$p; done | \
+ sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+ sort -u`; \
+ test -z "$$locs" || { \
+ echo rm -f $${locs}; \
+ rm -f $${locs}; \
+ }
+libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+ -rm -f *.$(OBJEXT)
+
+distclean-compile:
+ -rm -f *.tab.c
+
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codegen.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cp_sse.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@
+
+.c.o:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'`
+
+.c.lo:
+@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@ $(LTCOMPILE) -c -o $@ $<
+
+.s.o:
+ $(CCASCOMPILE) -c -o $@ $<
+
+.s.obj:
+ $(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.s.lo:
+ $(LTCCASCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ mkid -fID $$unique
tags: TAGS
-TAGS:
+TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ set x; \
+ here=`pwd`; \
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ shift; \
+ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+ test -n "$$unique" || unique=$$empty_fix; \
+ if test $$# -gt 0; then \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ "$$@" $$unique; \
+ else \
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+ $$unique; \
+ fi; \
+ fi
ctags: CTAGS
-CTAGS:
-
-cscope cscopelist:
+CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
+ END { if (nonempty) { for (i in files) print i; }; }'`; \
+ test -z "$(CTAGS_ARGS)$$unique" \
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+ $$unique
+
+GTAGS:
+ here=`$(am__cd) $(top_builddir) && pwd` \
+ && $(am__cd) $(top_srcdir) \
+ && gtags -i $(GTAGS_ARGS) "$$here"
+
+cscopelist: $(HEADERS) $(SOURCES) $(LISP)
+ list='$(SOURCES) $(HEADERS) $(LISP)'; \
+ case "$(srcdir)" in \
+ [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+ *) sdir=$(subdir)/$(srcdir) ;; \
+ esac; \
+ for i in $$list; do \
+ if test -f "$$i"; then \
+ echo "$(subdir)/$$i"; \
+ else \
+ echo "$$sdir/$$i"; \
+ fi; \
+ done >> $(top_builddir)/cscope.files
+distclean-tags:
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
distdir: $(DISTFILES)
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
@@ -233,8 +465,11 @@ distdir: $(DISTFILES)
done
check-am: all-am
check: check-am
-all-am: Makefile
+all-am: Makefile $(LTLIBRARIES)
installdirs:
+ for dir in "$(DESTDIR)$(libdir)"; do \
+ test -z "$$dir" || $(MKDIR_P) "$$dir"; \
+ done
install: install-am
install-exec: install-exec-am
install-data: install-data-am
@@ -265,11 +500,16 @@ distclean-generic:
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
-clean-am: clean-generic mostlyclean-am
+clean: clean-am
+
+clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \
+ mostlyclean-am
distclean: distclean-am
+ -rm -rf ./$(DEPDIR)
-rm -f Makefile
-distclean-am: clean-am distclean-generic
+distclean-am: clean-am distclean-compile distclean-generic \
+ distclean-tags
dvi: dvi-am
@@ -289,7 +529,7 @@ install-dvi: install-dvi-am
install-dvi-am:
-install-exec-am:
+install-exec-am: install-libLTLIBRARIES
install-html: install-html-am
@@ -312,12 +552,14 @@ install-ps-am:
installcheck-am:
maintainer-clean: maintainer-clean-am
+ -rm -rf ./$(DEPDIR)
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic
mostlyclean: mostlyclean-am
-mostlyclean-am: mostlyclean-generic
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+ mostlyclean-libtool
pdf: pdf-am
@@ -327,35 +569,24 @@ ps: ps-am
ps-am:
-uninstall-am:
+uninstall-am: uninstall-libLTLIBRARIES
.MAKE: install-am install-strip
-.PHONY: all all-am check check-am clean clean-generic distclean \
- distclean-generic distdir dvi dvi-am html html-am info info-am \
+.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
+ clean-libLTLIBRARIES clean-libtool cscopelist ctags distclean \
+ distclean-compile distclean-generic distclean-libtool \
+ distclean-tags distdir dvi dvi-am html html-am info info-am \
install install-am install-data install-data-am install-dvi \
install-dvi-am install-exec install-exec-am install-html \
- install-html-am install-info install-info-am install-man \
- install-pdf install-pdf-am install-ps install-ps-am \
- install-strip installcheck installcheck-am installdirs \
- maintainer-clean maintainer-clean-generic mostlyclean \
- mostlyclean-generic pdf pdf-am ps ps-am uninstall uninstall-am
-
-
-all: $(OBJLIBS)
-
-%.o: %.c $(HDRS)
- $(CC) $(CFLAGS) -c -o $@ $< -I../include -g
- $(CC) $(CFLAGS) -S $< -I../include
-
-%.o: %.s
- $(CC) $(CFLAGS) -c -o $@ $< -I../include -g
-
-$(OBJLIBS): $(OBJS)
- $(AR) rcs libffts.a $(OBJS)
+ install-html-am install-info install-info-am \
+ install-libLTLIBRARIES install-man install-pdf install-pdf-am \
+ install-ps install-ps-am install-strip installcheck \
+ installcheck-am installdirs maintainer-clean \
+ maintainer-clean-generic mostlyclean mostlyclean-compile \
+ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+ tags uninstall uninstall-am uninstall-libLTLIBRARIES
-clean:
- $(RM) -f *.o $(OBJLIBS)
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/src/codegen.c b/src/codegen.c
index 681414f..68d3b3d 100644
--- a/src/codegen.c
+++ b/src/codegen.c
@@ -1,10 +1,19 @@
#include "codegen.h"
#include "macros.h"
-#include "neon_float.h"
//#include "mini_macros.h"
-#include "neon.h"
#include "cp_sse.h"
#include <libkern/OSCacheControl.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#ifdef __ARM_NEON__
+ #include "codegen_neon.h"
+ #include "neon_float.h"
+ #include "neon.h"
+#else
+ #include "codegen_sse.h"
+ #include "sse_float.h"
+#endif
int tree_count(int N, int leafN, int offset) {
@@ -35,69 +44,7 @@ void elaborate_tree(size_t **p, int N, int leafN, int offset) {
}
-void neon_x4_x(data_t * restrict data, const data_t * restrict LUT) {
- X_8_SPLIT(data, 64, LUT);
-}
-
-
-uint32_t BL(void *pos, void *target) {
- return 0xeb000000 | (((target - pos) / 4) & 0xffffff);
-}
-
-uint32_t B(uint8_t r) {
- return 0xe12fff10 | r;
-}
-
-uint32_t MOV(uint8_t dst, uint8_t src) {
- return 0xe1a00000 | (src & 0xf) | ((dst & 0xf) << 12);
-}
-
-void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
- int32_t oimm = imm;
- if(imm < 0) {
- imm = -imm;
- uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
- if(shamt & 1) shamt -= 1;
- imm >>= shamt;
- shamt = (32 - shamt)/2;
-
- // if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
- *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
-
- if(imm > 255) ADDI(p, dst, src, (oimm + ((imm & 0xff) << (32-shamt*2))));
-
- }else{
- uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
- if(shamt & 1) shamt -= 1;
- imm >>= shamt;
- shamt = (32 - shamt)/2;
-
-// if(imm > 255) fprintf(stderr, "imm>255: %d\n", oimm);
-
- *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
-
- if(imm > 255) ADDI(p, dst, src, (oimm - ((imm & 0xff) << (32-shamt*2))));
- }
-}
-
-uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
- return 0xe5900000 | ((dst & 0xf) << 12)
- | ((base & 0xf) << 16) | (offset & 0xfff) ;
-}
-uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
- uint32_t oimm = imm;
-
- uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
- if(shamt & 1) shamt -= 1;
- imm >>= shamt;
- shamt = (32 - shamt)/2;
- *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
- if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
-}
-
-uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
-uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }
uint32_t LUT_offset(size_t N, size_t leafN) {
int i;
@@ -130,7 +77,13 @@ uint32_t LUT_offset(size_t N, size_t leafN) {
return lut_size;
}
-transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) {
+#ifdef __ARM_NEON__
+ typedef uint32_t insns_t;
+#else
+ typedef uint8_t insns_t;
+#endif
+
+void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN) {
int count = tree_count(N, leafN, 0) + 1;
size_t *ps = malloc(count * 2 * sizeof(size_t));
size_t *pps = ps;
@@ -144,29 +97,37 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
if(N < 8192) p->transform_size = 8192;
else p->transform_size = N;
- p->transform_base = valloc(p->transform_size);//(void *)func;
- uint32_t *func = p->transform_base;//valloc(8192);
- uint32_t *fp = func;
+ p->transform_base = valloc(p->transform_size);//mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
+ /*
+ if(p->transform_base == MAP_FAILED) {
+ fprintf(stderr, "MAP FAILED\n");
+ exit(1);
+ }*/
+
+ insns_t *func = p->transform_base;//valloc(8192);
+ insns_t *fp = func;
fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
+ fprintf(stderr, "Base address = %016p\n", func);
if(!func) {
fprintf(stderr, "NOMEM\n");
exit(1);
}
- uint32_t *x_8_addr = fp;
+ insns_t *x_8_addr = fp;
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
fp += (neon_x8_t - neon_x8) / 4;
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
- uint32_t *x_4_addr = fp;
+ insns_t *x_4_addr = fp;
memcpy(fp, neon_x4, neon_x8 - neon_x4);
fp += (neon_x8 - neon_x4) / 4;
- uint32_t *start = fp;
+ insns_t *start = fp;
+#ifdef __ARM_NEON__
*fp++ = PUSH_LR();
ADDI(&fp, 3, 1, 0);
@@ -183,20 +144,40 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
ADDI(&fp, 1, 0, 0);
ADDI(&fp, 0, 2, 0), // mov out into r0
+#endif
p->oe_ws = oe_w_data;
p->ee_ws = ee_w_data;
p->eo_ws = eo_w_data;
-
+#ifdef __ARM_NEON__
*fp++ = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
MOVI(&fp, 11, p->i0);
+#else
+ *fp++ = 0x4c;
+ *fp++ = 0x8b;
+ *fp++ = 0x07;
+ MOVI(&fp, RCX, p->i0 * 4);
+ //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
+#endif
//fp++;
- memcpy(fp, neon_ee, neon_oo - neon_ee);
- fp += (neon_oo - neon_ee) / 4;
+ memcpy(fp, leaf_ee, neon_oo - leaf_ee);
+#ifdef __ARM_NEON__
+ fp += (neon_oo - leaf_ee) / 4;
+#else
+ int i;
+
+ IMM32_NI(fp + 3, READ_IMM32(fp + 3) + ((void *)leaf_ee - (void *)fp ));
+
+ uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
+ for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
+
+ fp += (neon_oo - leaf_ee);
+#endif
+#ifdef __ARM_NEON__
if(__builtin_ctzl(N) & 1){
ADDI(&fp, 2, 7, 0);
ADDI(&fp, 7, 9, 0);
@@ -316,6 +297,23 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
}
*fp++ = POP_LR(); count++;
+#else
+ RET(&fp);
+
+
+ uint8_t *pp = func;
+ int counter = 0;
+ do{
+ printf("%02x ", *pp);
+ if(counter++ % 16 == 15) printf("\n");
+ } while(++pp < fp);
+
+ printf("\n");
+
+
+#endif
+
+
// *fp++ = B(14); count++;
//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
@@ -327,7 +325,7 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
perror("Couldn't mprotect");
- return NULL;
+ exit(1);
}
sys_icache_invalidate(func, p->transform_size);
@@ -335,5 +333,5 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN)
fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
- return (transform_func_t)start;
+ p->transform = start;
}
diff --git a/src/codegen.h b/src/codegen.h
index f887ca0..a41735b 100644
--- a/src/codegen.h
+++ b/src/codegen.h
@@ -13,7 +13,7 @@ typedef struct _ffts_plan_t ffts_plan_t;
typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
-transform_func_t ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN);
+void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN);
//static const __attribute__ ((aligned(16))) float ee_w_data[4] =
//{0.70710678118654757273731092936941,
diff --git a/src/codegen_neon.h b/src/codegen_neon.h
new file mode 100644
index 0000000..b14cb1c
--- /dev/null
+++ b/src/codegen_neon.h
@@ -0,0 +1,68 @@
+#ifndef __CODEGEN_NEON_H__
+#define __CODEGEN_NEON_H__
+
+
+
+/*
+ * Build the instruction word 0xeb000000 whose low 24 bits hold the signed
+ * distance from pos to target, measured in 4-byte units.
+ */
+uint32_t BL(void *pos, void *target) {
+    uint32_t distance_field = ((target - pos) / 4) & 0xffffff;
+    return 0xeb000000 | distance_field;
+}
+
+/* Build the instruction word 0xe12fff10 with r OR'd into its low bits. */
+uint32_t B(uint8_t r) {
+    uint32_t word = 0xe12fff10;
+    word |= r;
+    return word;
+}
+
+/*
+ * Build the instruction word 0xe1a00000 with the low four bits of src in
+ * bits 0-3 and the low four bits of dst in bits 12-15.
+ */
+uint32_t MOV(uint8_t dst, uint8_t src) {
+    uint32_t src_field = src & 0xf;
+    uint32_t dst_field = (dst & 0xf) << 12;
+    return 0xe1a00000 | src_field | dst_field;
+}
+
+/*
+ * Emit instruction word(s) at *p that add the signed immediate imm to
+ * register src and leave the result in register dst, advancing *p past
+ * every word written.
+ *
+ * A negative imm is negated and emitted on base word 0xe2400000; a
+ * non-negative imm is emitted on base word 0xe2800000. Each word carries
+ * only eight bits of the immediate plus a 4-bit shift field, so when the
+ * shifted immediate is still larger than 255 the function recurses to emit
+ * the remaining part.
+ *
+ * NOTE(review): __builtin_ctzl is evaluated even when imm == 0; GCC
+ * documents that result as undefined. The immediate bits emitted are zero
+ * in that case, but confirm the shift field is harmless on the target.
+ */
+void ADDI(uint32_t **p, uint8_t dst, uint8_t src, int32_t imm) {
+    int32_t oimm = imm;
+    if(imm < 0) {
+        imm = -imm;
+        /* Even shift amount, capped at 14, taken from the trailing zero bits. */
+        uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
+        if(shamt & 1) shamt -= 1;
+        imm >>= shamt;
+        shamt = (32 - shamt)/2;
+
+        *(*p)++ = 0xe2400000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+        /* The word above already left src plus the first part in dst, so the
+           remainder must be accumulated onto dst; reading src again would
+           discard the first part whenever dst != src. */
+        if(imm > 255) ADDI(p, dst, dst, (oimm + ((imm & 0xff) << (32-shamt*2))));
+
+    }else{
+        /* Even shift amount, capped at 14, taken from the trailing zero bits. */
+        uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
+        if(shamt & 1) shamt -= 1;
+        imm >>= shamt;
+        shamt = (32 - shamt)/2;
+
+        *(*p)++ = 0xe2800000 | ((src & 0xf) << 16) | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff);
+
+        /* Same reasoning as the negative branch: continue from dst. */
+        if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
+    }
+}
+
+/*
+ * Build the instruction word 0xe5900000 with the low four bits of dst in
+ * bits 12-15, the low four bits of base in bits 16-19 and the low twelve
+ * bits of offset in bits 0-11.
+ */
+uint32_t LDRI(uint8_t dst, uint8_t base, uint32_t offset) {
+    uint32_t word = 0xe5900000;
+    word |= (dst & 0xf) << 12;
+    word |= (base & 0xf) << 16;
+    word |= offset & 0xfff;
+    return word;
+}
+
+// Emit instruction word(s) at *p that load the immediate imm into register
+// dst, advancing *p past every word written. The first word is built on
+// 0xe3a00000; if the shifted immediate does not fit in eight bits, ADDI is
+// called to add the remaining part onto dst.
+// NOTE(review): the function is declared to return uint32_t but has no
+// return statement, so callers must not use its result.
+// NOTE(review): __builtin_ctzl(imm) is evaluated even when imm == 0, which
+// GCC documents as undefined -- confirm callers never rely on that case.
+uint32_t MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
+ uint32_t oimm = imm;
+
+ // Even shift amount, capped at 14, taken from the trailing zero bits.
+ uint32_t shamt = (__builtin_ctzl(imm)>15)?15:__builtin_ctzl(imm);
+ if(shamt & 1) shamt -= 1;
+ imm >>= shamt;
+ // Convert the shift amount into the value stored in bits 8-11 of the word.
+ shamt = (32 - shamt)/2;
+ *(*p)++ = 0xe3a00000 | ((dst & 0xf) << 12) | ((shamt & 0xf) << 8) | (imm & 0xff) ;
+ // Only the low eight bits were encoded above; add whatever is left onto dst.
+ if(imm > 255) ADDI(p, dst, dst, (oimm - ((imm & 0xff) << (32-shamt*2))));
+}
+
+/* Return the fixed instruction word 0xe92d4ff0 (the author also noted 0xe92d4000 as an alternative). */
+uint32_t PUSH_LR() {
+    const uint32_t word = 0xe92d4ff0;
+    return word;
+}
+
+/* Return the fixed instruction word 0xe8bd8ff0 (the author also noted 0xe8bd8000 as an alternative). */
+uint32_t POP_LR() {
+    const uint32_t word = 0xe8bd8ff0;
+    return word;
+}
+
+
+
+
+#endif
diff --git a/src/codegen_sse.h b/src/codegen_sse.h
new file mode 100644
index 0000000..d3b136a
--- /dev/null
+++ b/src/codegen_sse.h
@@ -0,0 +1,104 @@
+#ifndef __CODEGEN_SSE_H__
+#define __CODEGEN_SSE_H__
+
+static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376,
+ -0.70710678118654757273731092936941,-0.70710678118654746171500846685376};
+static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+void neon_x4(float *, size_t, float *);
+void neon_x8(float *, size_t, float *);
+void neon_x8_t(float *, size_t, float *);
+void leaf_ee();
+void neon_oo();
+void neon_eo();
+void neon_oe();
+void neon_end();
+
+
+extern const uint32_t sse_leaf_ee_offsets[8];
+
+#define EAX 0
+#define ECX 1
+#define EDX 2
+#define EBX 3
+#define ESI 6
+#define EDI 7
+#define EBP 5
+
+#define RAX 0
+#define RCX 1
+#define RDX 2
+#define RBX 3
+#define RSI 6
+#define RDI 7
+#define RBP 5
+#define R8 8
+#define R9 9
+#define R10 10
+#define R11 11
+#define R12 12
+#define R13 13
+#define R14 14
+#define R15 15
+
+/* Append the low eight bits of imm at *p and advance *p by one byte. */
+void IMM8(uint8_t **p, uint32_t imm) {
+    uint8_t *out = *p;
+    out[0] = (imm & 0xff);
+    *p = out + 1;
+}
+
+/*
+ * Append imm at *p as four bytes, least-significant byte first, and advance
+ * *p by four.
+ */
+void IMM32(uint8_t **p, uint32_t imm) {
+    int i;
+    for(i=0;i<4;i++) {
+        /* Shift the unsigned value down and mask, rather than shifting the
+           int constant 0xff up: 0xff << 24 does not fit in a signed int. */
+        *(*p)++ = (imm >> (i*8)) & 0xff;
+    }
+}
+/*
+ * Store imm at p as four bytes, least-significant byte first. Unlike IMM32
+ * this takes the destination by value and does not advance any cursor.
+ */
+void IMM32_NI(uint8_t *p, uint32_t imm) {
+    int i;
+    for(i=0;i<4;i++) {
+        /* Shift the unsigned value down and mask, rather than shifting the
+           int constant 0xff up: 0xff << 24 does not fit in a signed int. */
+        p[i] = (imm >> (i*8)) & 0xff;
+    }
+}
+
+/*
+ * Read four bytes starting at p, least-significant byte first, and return
+ * them assembled into one 32-bit value.
+ */
+uint32_t READ_IMM32(uint8_t *p) {
+    uint32_t rval = 0;
+    int i;
+    for(i=0;i<4;i++) {
+        /* Widen to uint32_t before shifting: a byte >= 0x80 promoted to int
+           and shifted left by 24 would overflow the signed type. */
+        rval |= (uint32_t)p[i] << (i*8);
+    }
+    return rval;
+}
+
+/*
+ * Emit a move of the immediate imm into register dst at *p, advancing *p.
+ * Registers 0-7 use the single opcode byte 0xb8 + dst; registers 8 and up
+ * use the three bytes 0x49, 0xc7, 0xc0 | (dst - 8). In both cases the
+ * immediate follows as four bytes written by IMM32.
+ */
+void MOVI(uint8_t **p, uint8_t dst, uint32_t imm) {
+    if(dst >= 8) {
+        uint8_t *out = *p;
+        out[0] = 0x49;
+        out[1] = 0xc7;
+        out[2] = 0xc0 | (dst - 8);
+        *p = out + 3;
+    }else{
+        *(*p)++ = 0xb8 + dst;
+    }
+    IMM32(p, imm);
+}
+
+/*
+ * Emit an addressing byte at *p combining the low three bits of rm (bits
+ * 0-2) and of reg (bits 3-5), followed by a displacement when disp != 0:
+ *   disp == 0            -> mode bits 0x00, no displacement
+ *   -128 <= disp <= 127  -> mode bits 0x40, one displacement byte (IMM8)
+ *   anything else        -> mode bits 0x80, four displacement bytes (IMM32)
+ * *p is advanced past everything written.
+ *
+ * NOTE(review): base registers whose low three bits are 4 or 5 are special
+ * cases in x86 ModRM encoding and are not treated specially here -- confirm
+ * before using such a register as rm.
+ */
+void ADDRMODE(uint8_t **p, uint8_t reg, uint8_t rm, int32_t disp) {
+    if(disp == 0) {
+        *(*p)++ = (rm & 7) | ((reg & 7) << 3);
+    }else if(disp <= 127 && disp >= -128) {
+        /* Both bounds must hold; joined with || the test was always true and
+           every non-zero displacement was truncated to a single byte. */
+        *(*p)++ = 0x40 | (rm & 7) | ((reg & 7) << 3);
+        IMM8(p, disp);
+    }else{
+        *(*p)++ = 0x80 | (rm & 7) | ((reg & 7) << 3);
+        IMM32(p, disp);
+    }
+}
+
+/*
+ * Emit at *p a prefix byte (0x48 plus bit 3 of base in bit 0 and bit 3 of
+ * dst in bit 2), the opcode byte 0x8d, and then the addressing bytes that
+ * ADDRMODE produces for dst, base and disp. *p is advanced accordingly.
+ */
+void LEA(uint8_t **p, uint8_t dst, uint8_t base, int32_t disp) {
+    uint8_t prefix = 0x48;
+    prefix |= (base & 0x8) >> 3;
+    prefix |= (dst & 0x8) >> 1;
+
+    *(*p)++ = prefix;
+    *(*p)++ = 0x8d;
+    ADDRMODE(p, dst, base, disp);
+}
+
+/* Append the single byte 0xc3 at *p and advance *p by one. */
+void RET(uint8_t **p) {
+    uint8_t *out = *p;
+    *out = 0xc3;
+    *p = out + 1;
+}
+
+#endif
diff --git a/src/cp_sse.c b/src/cp_sse.c
index 088798b..f9a5c8f 100644
--- a/src/cp_sse.c
+++ b/src/cp_sse.c
@@ -308,8 +308,7 @@ ffts_plan_t *ffts_init(size_t N, int sign) {
p->N = N;
p->lastlut = w;
p->n_luts = n_luts;
- if(N>=32)
- p->transform = ffts_generate_func_code(p, N, leafN);
+ if(N>=32) ffts_generate_func_code(p, N, leafN);
// fprintf(stderr, "sizeof(size_t) == %lu\n", sizeof(size_t));
return p;
diff --git a/src/neon.h b/src/neon.h
new file mode 100644
index 0000000..16cb38d
--- /dev/null
+++ b/src/neon.h
@@ -0,0 +1,17 @@
+#ifndef __NEON_H__
+#define __NEON_H__
+
+static const __attribute__ ((aligned(16))) float ee_w_data[4] = {0.70710678118654757273731092936941,0.70710678118654746171500846685376,
+ -0.70710678118654757273731092936941,-0.70710678118654746171500846685376};
+static const __attribute__ ((aligned(16))) data_t oe_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+static const __attribute__ ((aligned(16))) data_t eo_w_data[4] = {1.0f,0.70710678118654757273731092936941f, 0.0f,-0.70710678118654746171500846685376};
+void neon_x4(float *, size_t, float *);
+void neon_x8(float *, size_t, float *);
+void neon_x8_t(float *, size_t, float *);
+void neon_ee();
+void neon_oo();
+void neon_eo();
+void neon_oe();
+void neon_end();
+
+#endif
diff --git a/src/patterns.c b/src/patterns.c
index 664f20e..29fa5ae 100644
--- a/src/patterns.c
+++ b/src/patterns.c
@@ -114,9 +114,9 @@ void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
for(i=0;i<N/leafN;i++) {
p->offsets[i] = offsets[i*2+1]*2;
}
-//for(i=0;i<N/leafN;i++) {
-// printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
-//}
+ for(i=0;i<N/leafN;i++) {
+ printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
+ }
free(offsets);
diff --git a/src/sse.s b/src/sse.s
new file mode 100644
index 0000000..c8e509f
--- /dev/null
+++ b/src/sse.s
@@ -0,0 +1,174 @@
+
+ .globl _neon_x4
+ .align 4
+_neon_x4:
+
+ .globl _neon_x8
+ .align 4
+_neon_x8:
+
+ .globl _neon_x8_t
+ .align 4
+_neon_x8_t:
+
+
+
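+# rax (eax) is loop counter (init to 0, advanced by 4 each iteration)
+# rcx is loop max count
+# rsi is 'in' base pointer
+# rdx is 'out' base pointer
+# r8 is offsets pointer
+# r9 is constants pointer (loaded from L_sse_constants on entry)
+# scratch: r11 r12 (output offsets loaded from the table at r8)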
+ .globl _leaf_ee
+ .align 4, 0x90
+_leaf_ee:
+ lea L_sse_constants(%rip), %r9
+ movaps 32(%r9), %xmm0 #83.5
+ movaps 0x0(%r9), %xmm8 #83.5
+ xorl %eax, %eax
+ .align 4, 0x90
+LEAF_EE_1:
+LEAF_EE_const_0:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm7 #83.5
+LEAF_EE_const_2:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm12 #83.5
+ movaps %xmm7, %xmm6 #83.5
+LEAF_EE_const_3:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm10 #83.5
+ movaps %xmm12, %xmm11 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm10, %xmm11 #83.5
+ xorps %xmm8, %xmm12 #83.5
+LEAF_EE_const_1:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm9 #83.5
+LEAF_EE_const_4:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm10 #83.5
+ addps %xmm9, %xmm6 #83.5
+ subps %xmm9, %xmm7 #83.5
+LEAF_EE_const_5:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm13 #83.5
+ movaps %xmm10, %xmm9 #83.5
+LEAF_EE_const_6:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm3 #83.5
+ movaps %xmm6, %xmm5 #83.5
+LEAF_EE_const_7:
+ movaps 0xBEBAFECA(%rsi,%rax,4), %xmm14 #83.5
+ movaps %xmm3, %xmm15 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm7, %xmm4 #83.5
+ movslq (%r8, %rax, 8), %r11 #83.44
+ subps %xmm13, %xmm10 #83.5
+ subps %xmm14, %xmm3 #83.5
+ addps %xmm11, %xmm5 #83.5
+ subps %xmm11, %xmm6 #83.5
+ subps %xmm12, %xmm4 #83.5
+ addps %xmm12, %xmm7 #83.5
+ addps %xmm13, %xmm9 #83.5
+ addps %xmm14, %xmm15 #83.5
+ movaps 16(%r9), %xmm12 #83.5
+ movaps %xmm9, %xmm1 #83.5
+ movaps 16(%r9), %xmm11 #83.5
+ movaps %xmm5, %xmm2 #83.5
+ mulps %xmm10, %xmm12 #83.5
+ subps %xmm15, %xmm9 #83.5
+ addps %xmm15, %xmm1 #83.5
+ mulps %xmm3, %xmm11 #83.5
+ addps %xmm1, %xmm2 #83.5
+ subps %xmm1, %xmm5 #83.5
+ shufps $177, %xmm10, %xmm10 #83.5
+ xorps %xmm8, %xmm9 #83.5
+ shufps $177, %xmm3, %xmm3 #83.5
+ movaps %xmm6, %xmm1 #83.5
+ mulps %xmm0, %xmm10 #83.5
+ movaps %xmm4, %xmm13 #83.5
+ mulps %xmm0, %xmm3 #83.5
+ subps %xmm10, %xmm12 #83.5
+ addps %xmm3, %xmm11 #83.5
+ movaps %xmm12, %xmm3 #83.5
+ movaps %xmm7, %xmm14 #83.5
+ shufps $177, %xmm9, %xmm9 #83.5
+ subps %xmm11, %xmm12 #83.5
+ addps %xmm11, %xmm3 #83.5
+ subps %xmm9, %xmm1 #83.5
+ addps %xmm9, %xmm6 #83.5
+ addps %xmm3, %xmm4 #83.5
+ subps %xmm3, %xmm13 #83.5
+ xorps %xmm8, %xmm12 #83.5
+ movaps %xmm2, %xmm3 #83.5
+ shufps $177, %xmm12, %xmm12 #83.5
+ movaps %xmm6, %xmm9 #83.5
+ movslq 4(%r8, %rax, 8), %r12 #83.59
+ movlhps %xmm4, %xmm3 #83.5
+ addq $4, %rax
+ shufps $238, %xmm4, %xmm2 #83.5
+ movaps %xmm1, %xmm4 #83.5
+ movaps %xmm3, (%rdx,%r11,4) #83.5
+ subps %xmm12, %xmm7 #83.5
+ addps %xmm12, %xmm14 #83.5
+ movlhps %xmm7, %xmm4 #83.5
+ shufps $238, %xmm7, %xmm1 #83.5
+ movaps %xmm5, %xmm7 #83.5
+ movlhps %xmm13, %xmm7 #83.5
+ movlhps %xmm14, %xmm9 #83.5
+ shufps $238, %xmm13, %xmm5 #83.5
+ shufps $238, %xmm14, %xmm6 #83.5
+ movaps %xmm4, 16(%rdx,%r11,4) #83.5
+ movaps %xmm7, 32(%rdx,%r11,4) #83.5
+ movaps %xmm9, 48(%rdx,%r11,4) #83.5
+ movaps %xmm2, (%rdx,%r12,4) #83.5
+ movaps %xmm1, 16(%rdx,%r12,4) #83.5
+ movaps %xmm5, 32(%rdx,%r12,4) #83.5
+ movaps %xmm6, 48(%rdx,%r12,4) #83.5
+ cmpq %rcx, %rax
+ jne LEAF_EE_1
+
+ .globl _neon_oo
+ .align 4
+_neon_oo:
+
+
+ .globl _neon_eo
+ .align 4
+_neon_eo:
+
+
+ .globl _neon_oe
+ .align 4
+_neon_oe:
+
+
+ .globl _neon_end
+ .align 4
+_neon_end:
+
+
+ .globl _sse_leaf_ee_offsets
+ .align 4
+_sse_leaf_ee_offsets:
+ .long LEAF_EE_const_0-_leaf_ee+0x4
+ .long LEAF_EE_const_1-_leaf_ee+0x5
+ .long LEAF_EE_const_2-_leaf_ee+0x5
+ .long LEAF_EE_const_3-_leaf_ee+0x5
+ .long LEAF_EE_const_4-_leaf_ee+0x5
+ .long LEAF_EE_const_5-_leaf_ee+0x5
+ .long LEAF_EE_const_6-_leaf_ee+0x4
+ .long LEAF_EE_const_7-_leaf_ee+0x5
+
+ .section __TEXT, __const
+ .align 4
+L_sse_constants:
+L_2il0floatpacket.719:
+ .long 0x00000000,0x80000000,0x00000000,0x80000000
+ .align 4
+L_2il0floatpacket.720:
+ .long 0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
+ .align 4
+L_2il0floatpacket.721:
+ .long 0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
+ .align 4
+L_2il0floatpacket.722:
+ .long 0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
+ .align 4
+L_2il0floatpacket.723:
+ .long 0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
OpenPOWER on IntegriCloud