Optimize sum() functions

author: Erik Schnetter <schnetter@gmail.com> 2012-12-02 22:48:15 -0500
committer: Erik Schnetter <schnetter@gmail.com> 2012-12-02 22:48:15 -0500
commit: 94b89a202430d972b04c8ed3b65a832127e38de8 (patch)
tree: 79bbf338ee27bdaf5962627222e254ec3c9a9c68
parent: 8f470967080fbff980f77d9dcd9656e9a152abf9 (diff)
download: vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.zip
vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.tar.gz
9 files changed, 132 insertions, 51 deletions
diff --git a/.gitignore b/.gitignore
index b69d47d..979b68a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ old
 test
 .ninja_log
 OUT
+test.out
diff --git a/build.ninja b/build.ninja
index fbfd02b..23566c4 100644
--- a/build.ninja
+++ b/build.ninja
@@ -2,8 +2,7 @@ ar  = ar
 cxx = g++
 
 cppflags =
-cxxflags = -std=gnu++11 -Wall -g -march=native
-# -Ofast
+cxxflags = -std=gnu++11 -Wall -g -march=native -Ofast
 ldflags = -L.
 
 rule cxx
diff --git a/example.cc b/example.cc
index 50415c2..f3150b0 100644
--- a/example.cc
+++ b/example.cc
@@ -1,12 +1,47 @@
 // -*-C++-*-
 
-#include "vec_float.h"
-#include "vec_double_avx.h"
+#include "vecmathlib.h"
 
 #include <iostream>
 
 using namespace std;
 
+float arg_f1(float x) { return -vecmathlib::realvec<float,1>(x)[0]; }  // arg_*: negated lane 0 of a realvec built from the raw argument
+float arg_f4(__m128 x) { return -vecmathlib::realvec<float,4>(x)[0]; }
+float arg_f8(__m256 x) { return -vecmathlib::realvec<float,8>(x)[0]; }
+// size_*: sizeof the realvec specialization named in the body; the argument x is unused.
+int size_f1(float x) { return sizeof(vecmathlib::realvec<float,1>); }
+int size_f4(__m128 x) { return sizeof(vecmathlib::realvec<float,4>); }
+int size_f8(__m256 x) { return sizeof(vecmathlib::realvec<float,8>); }
+// sum_*: take a realvec by value and return sum(x) (unqualified call) for each width.
+float sum_f1(vecmathlib::realvec<float,1> x) { return sum(x); }
+float sum_f4(vecmathlib::realvec<float,4> x) { return sum(x); }
+float sum_f8(vecmathlib::realvec<float,8> x) { return sum(x); }
+double sum_d1(vecmathlib::realvec<double,1> x) { return sum(x); }
+double sum_d2(vecmathlib::realvec<double,2> x) { return sum(x); }
+double sum_d4(vecmathlib::realvec<double,4> x) { return sum(x); }
+// eltK_*: take a realvec by value and return its lane K through operator[], one wrapper per lane.
+float elt0_f1(vecmathlib::realvec<float,1> x) { return x[0]; }
+float elt0_f4(vecmathlib::realvec<float,4> x) { return x[0]; }
+float elt1_f4(vecmathlib::realvec<float,4> x) { return x[1]; }
+float elt2_f4(vecmathlib::realvec<float,4> x) { return x[2]; }
+float elt3_f4(vecmathlib::realvec<float,4> x) { return x[3]; }
+float elt0_f8(vecmathlib::realvec<float,8> x) { return x[0]; }
+float elt1_f8(vecmathlib::realvec<float,8> x) { return x[1]; }
+float elt2_f8(vecmathlib::realvec<float,8> x) { return x[2]; }
+float elt3_f8(vecmathlib::realvec<float,8> x) { return x[3]; }
+float elt4_f8(vecmathlib::realvec<float,8> x) { return x[4]; }
+float elt5_f8(vecmathlib::realvec<float,8> x) { return x[5]; }
+float elt6_f8(vecmathlib::realvec<float,8> x) { return x[6]; }
+float elt7_f8(vecmathlib::realvec<float,8> x) { return x[7]; }
+double elt0_d1(vecmathlib::realvec<double,1> x) { return x[0]; }
+double elt0_d2(vecmathlib::realvec<double,2> x) { return x[0]; }
+double elt1_d2(vecmathlib::realvec<double,2> x) { return x[1]; }
+double elt0_d4(vecmathlib::realvec<double,4> x) { return x[0]; }
+double elt1_d4(vecmathlib::realvec<double,4> x) { return x[1]; }
+double elt2_d4(vecmathlib::realvec<double,4> x) { return x[2]; }
+double elt3_d4(vecmathlib::realvec<double,4> x) { return x[3]; }
+
 int main(int argc, char** argv)
 {
   using namespace vecmathlib;
diff --git a/vec_double.h b/vec_double.h
index 1f1a1dd..5074ac2 100644
--- a/vec_double.h
+++ b/vec_double.h
@@ -39,12 +39,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
@@ -110,12 +104,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
@@ -204,12 +192,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
diff --git a/vec_double_avx.h b/vec_double_avx.h
index 081045f..3c537f8 100644
--- a/vec_double_avx.h
+++ b/vec_double_avx.h
@@ -372,7 +372,19 @@ namespace vecmathlib {
     realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
     
     operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    // Lane n of 0..3: pick the 128-bit half, move the lane to slot 0, convert.
+    real_t operator[](int n) const
+    {
+      assert(n >= 0 && n < 4);  // with NDEBUG, a bad n falls into default (lane 3)
+      __m128d vlo = _mm256_extractf128_pd(v, 0);
+      __m128d vhi = _mm256_extractf128_pd(v, 1);
+      switch (n){
+      case 0: return _mm_cvtsd_f64(vlo);
+      case 1: return _mm_cvtsd_f64(_mm_shuffle_pd(vlo, vlo, _MM_SHUFFLE2(0,1)));
+      case 2: return _mm_cvtsd_f64(vhi);
+      default: return _mm_cvtsd_f64(_mm_shuffle_pd(vhi, vhi, _MM_SHUFFLE2(0,1)));
+      }
+    }
     realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
     
     
@@ -401,7 +413,13 @@ namespace vecmathlib {
     }
     real_t sum() const
     {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      // Sum of the four lanes, grouped as (v[0]+v[1]) + (v[2]+v[3]).
+      // _mm256_hadd_pd(a, a) adds adjacent pairs inside each 128-bit half, so
+      // afterwards lanes 0/1 both hold v[0]+v[1] and lanes 2/3 hold v[2]+v[3];
+      // one lane from each half is then added to form the total.
+      realvec x = *this;
+      x = _mm256_hadd_pd(x.v, x.v);
+      return x[0] + x[2];
     }
     
     
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
index 1c6648c..1794422 100644
--- a/vec_double_sse2.h
+++ b/vec_double_sse2.h
@@ -288,7 +288,15 @@ namespace vecmathlib {
     realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
     
     operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    // Lane n of 0..1: lane 1 is swapped into slot 0 before the scalar convert.
+    real_t operator[](int n) const
+    {
+      assert(n >= 0 && n < 2);  // with NDEBUG, a bad n falls into default (lane 1)
+      switch (n){
+      case 0: return _mm_cvtsd_f64(v);
+      default: return _mm_cvtsd_f64(_mm_shuffle_pd(v, v, _MM_SHUFFLE2(0,1)));
+      }
+    }
     realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
     
     
@@ -317,7 +325,8 @@ namespace vecmathlib {
     }
     real_t sum() const
     {
-      return (*this)[0] + (*this)[1];
+      // SSE2-only v[0]+v[1] (_mm_hadd_pd is SSE3): add high lane onto low lane.
+      return _mm_cvtsd_f64(_mm_add_sd(v, _mm_unpackhi_pd(v, v)));
     }
     
     
diff --git a/vec_float.h b/vec_float.h
index 60636ba..7378104 100644
--- a/vec_float.h
+++ b/vec_float.h
@@ -39,12 +39,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
@@ -110,12 +104,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
@@ -204,12 +192,6 @@ namespace vecmathlib {
     typedef boolvec_t BV;
     typedef floatprops<real_t> FP;
     typedef mathfuncs<realvec_t> MF;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
     
     
     
diff --git a/vec_float_avx.h b/vec_float_avx.h
index 4a19673..e60f1eb 100644
--- a/vec_float_avx.h
+++ b/vec_float_avx.h
@@ -95,11 +95,15 @@ namespace vecmathlib {
     
     bool all() const
     {
-      return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+      return  // true only when every one of lanes 0..7 is true
+        (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+        (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
     }
     bool any() const
     {
-      return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+      return  // true when at least one of lanes 0..7 is true
+        (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+        (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
     }
     
     
@@ -350,7 +354,31 @@ namespace vecmathlib {
                                                as[3], as[2], as[1], as[0])) {}
     
     operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    // Lane n of 0..7: pick the 128-bit half, shuffle the lane to slot 0, convert.
+    real_t operator[](int n) const
+    {
+      assert(n >= 0 && n < 8);  // with NDEBUG, a bad n falls into default (lane 7)
+      __m128 vlo = _mm256_extractf128_ps(v, 0);
+      __m128 vhi = _mm256_extractf128_ps(v, 1);
+      switch (n){
+      case 0:
+        return _mm_cvtss_f32(vlo);
+      case 1:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(2,3,0,1)));
+      case 2:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(1,0,3,2)));
+      case 3:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(0,1,2,3)));
+      case 4:
+        return _mm_cvtss_f32(vhi);
+      case 5:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(2,3,0,1)));
+      case 6:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(1,0,3,2)));
+      default:
+        return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(0,1,2,3)));
+      }
+    }
     realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
     
     
@@ -375,11 +403,24 @@ namespace vecmathlib {
     
     real_t prod() const
     {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+      return  // product over lanes 0..7, multiplied left to right
+        (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
+        (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
     }
     real_t sum() const
     {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      // Sum of all eight lanes; the scalar form would be
+      //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
+      //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
+      // _mm256_hadd_ps adds adjacent pairs separately inside each 128-bit
+      // half, so two rounds leave the sum of lanes 0..3 in every lane of the
+      // low half and the sum of lanes 4..7 in every lane of the high half.
+      // Lane 0 and lane 4 are then added to combine the two halves.
+      // The terms are grouped pairwise here, not left to right.
+      realvec x = *this;
+      x = _mm256_hadd_ps(x.v, x.v);
+      x = _mm256_hadd_ps(x.v, x.v);
+      return x[0] + x[4];
     }
     
     
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
index f99eae8..635e018 100644
--- a/vec_float_sse2.h
+++ b/vec_float_sse2.h
@@ -259,7 +259,17 @@ namespace vecmathlib {
     realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
     
     operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    // Lane n of 0..3: shuffle the wanted lane into slot 0, then convert.
+    real_t operator[](int n) const
+    {
+      assert(n >= 0 && n < 4);  // with NDEBUG, a bad n falls into default (lane 3)
+      switch (n){
+      case 0: return _mm_cvtss_f32(v);
+      case 1: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)));
+      case 2: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1,0,3,2)));
+      default: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0,1,2,3)));
+      }
+    }
     realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
     
     
@@ -288,7 +298,11 @@ namespace vecmathlib {
     }
     real_t sum() const
     {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+      // SSE2-only pairwise sum (_mm_hadd_ps is SSE3): (v0+v1) + (v2+v3).
+      // Each step adds the vector to a lane-swapped copy of itself.
+      __m128 x = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)));
+      x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1,0,3,2)));
+      return _mm_cvtss_f32(x);
     }
author	Erik Schnetter <schnetter@gmail.com>	2012-12-02 22:48:15 -0500
committer	Erik Schnetter <schnetter@gmail.com>	2012-12-02 22:48:15 -0500
commit	94b89a202430d972b04c8ed3b65a832127e38de8 (patch)
tree	79bbf338ee27bdaf5962627222e254ec3c9a9c68
parent	8f470967080fbff980f77d9dcd9656e9a152abf9 (diff)
download	vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.zip vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.tar.gz