summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Erik Schnetter <schnetter@gmail.com> 2012-12-02 22:48:15 -0500
committer: Erik Schnetter <schnetter@gmail.com> 2012-12-02 22:48:15 -0500
commit: 94b89a202430d972b04c8ed3b65a832127e38de8 (patch)
tree: 79bbf338ee27bdaf5962627222e254ec3c9a9c68
parent: 8f470967080fbff980f77d9dcd9656e9a152abf9 (diff)
download: vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.zip
download: vecmathlib-94b89a202430d972b04c8ed3b65a832127e38de8.tar.gz
Optimize sum() functions
-rw-r--r-- .gitignore 1
-rw-r--r-- build.ninja 3
-rw-r--r-- example.cc 39
-rw-r--r-- vec_double.h 18
-rw-r--r-- vec_double_avx.h 22
-rw-r--r-- vec_double_sse2.h 13
-rw-r--r-- vec_float.h 18
-rw-r--r-- vec_float_avx.h 51
-rw-r--r-- vec_float_sse2.h 18
9 files changed, 132 insertions, 51 deletions
diff --git a/.gitignore b/.gitignore
index b69d47d..979b68a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ old
test
.ninja_log
OUT
+test.out
diff --git a/build.ninja b/build.ninja
index fbfd02b..23566c4 100644
--- a/build.ninja
+++ b/build.ninja
@@ -2,8 +2,7 @@ ar = ar
cxx = g++
cppflags =
-cxxflags = -std=gnu++11 -Wall -g -march=native
-# -Ofast
+cxxflags = -std=gnu++11 -Wall -g -march=native -Ofast
ldflags = -L.
rule cxx
diff --git a/example.cc b/example.cc
index 50415c2..f3150b0 100644
--- a/example.cc
+++ b/example.cc
@@ -1,12 +1,47 @@
// -*-C++-*-
-#include "vec_float.h"
-#include "vec_double_avx.h"
+#include "vecmathlib.h"
#include <iostream>
using namespace std;
+float arg_f1(float x) { return -vecmathlib::realvec<float,1>(x)[0]; } // arg_fN: wrap the raw argument in an N-lane realvec, read lane 0, negate that scalar ([] binds tighter than unary -)
+float arg_f4(__m128 x) { return -vecmathlib::realvec<float,4>(x)[0]; } // same, starting from a raw __m128
+float arg_f8(__m256 x) { return -vecmathlib::realvec<float,8>(x)[0]; } // same, starting from a raw __m256
+
+int size_f1(float x) { return sizeof(vecmathlib::realvec<float,1>); } // size_fN: sizeof the N-lane float wrapper type; the argument x is never read
+int size_f4(__m128 x) { return sizeof(vecmathlib::realvec<float,4>); } // sizeof the 4-lane wrapper
+int size_f8(__m256 x) { return sizeof(vecmathlib::realvec<float,8>); } // sizeof the 8-lane wrapper
+
+float sum_f1(vecmathlib::realvec<float,1> x) { return sum(x); } // sum_fN / sum_dN: forward an N-lane float/double vector (taken by value) to the unqualified sum(); presumably resolved by ADL in vecmathlib -- confirm
+float sum_f4(vecmathlib::realvec<float,4> x) { return sum(x); }
+float sum_f8(vecmathlib::realvec<float,8> x) { return sum(x); }
+double sum_d1(vecmathlib::realvec<double,1> x) { return sum(x); } // double-precision variants start here
+double sum_d2(vecmathlib::realvec<double,2> x) { return sum(x); }
+double sum_d4(vecmathlib::realvec<double,4> x) { return sum(x); }
+
+float elt0_f1(vecmathlib::realvec<float,1> x) { return x[0]; } // eltK_fN / eltK_dN: return lane K of an N-lane float/double vector (taken by value) through operator[] with a literal index
+float elt0_f4(vecmathlib::realvec<float,4> x) { return x[0]; } // 4-lane float: lanes 0..3
+float elt1_f4(vecmathlib::realvec<float,4> x) { return x[1]; }
+float elt2_f4(vecmathlib::realvec<float,4> x) { return x[2]; }
+float elt3_f4(vecmathlib::realvec<float,4> x) { return x[3]; }
+float elt0_f8(vecmathlib::realvec<float,8> x) { return x[0]; } // 8-lane float: lanes 0..7
+float elt1_f8(vecmathlib::realvec<float,8> x) { return x[1]; }
+float elt2_f8(vecmathlib::realvec<float,8> x) { return x[2]; }
+float elt3_f8(vecmathlib::realvec<float,8> x) { return x[3]; }
+float elt4_f8(vecmathlib::realvec<float,8> x) { return x[4]; }
+float elt5_f8(vecmathlib::realvec<float,8> x) { return x[5]; }
+float elt6_f8(vecmathlib::realvec<float,8> x) { return x[6]; }
+float elt7_f8(vecmathlib::realvec<float,8> x) { return x[7]; }
+double elt0_d1(vecmathlib::realvec<double,1> x) { return x[0]; } // double-precision variants start here
+double elt0_d2(vecmathlib::realvec<double,2> x) { return x[0]; } // 2-lane double: lanes 0..1
+double elt1_d2(vecmathlib::realvec<double,2> x) { return x[1]; }
+double elt0_d4(vecmathlib::realvec<double,4> x) { return x[0]; } // 4-lane double: lanes 0..3
+double elt1_d4(vecmathlib::realvec<double,4> x) { return x[1]; }
+double elt2_d4(vecmathlib::realvec<double,4> x) { return x[2]; }
+double elt3_d4(vecmathlib::realvec<double,4> x) { return x[3]; }
+
int main(int argc, char** argv)
{
using namespace vecmathlib;
diff --git a/vec_double.h b/vec_double.h
index 1f1a1dd..5074ac2 100644
--- a/vec_double.h
+++ b/vec_double.h
@@ -39,12 +39,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
@@ -110,12 +104,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
@@ -204,12 +192,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
diff --git a/vec_double_avx.h b/vec_double_avx.h
index 081045f..3c537f8 100644
--- a/vec_double_avx.h
+++ b/vec_double_avx.h
@@ -372,7 +372,19 @@ namespace vecmathlib {
realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
operator vector_t() const { return v; }
- real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ real_t operator[](int n) const
+ {
+   // Returns lane n (valid range 0..3) using register operations only:
+   // split the 256-bit vector into its two 128-bit halves, bring the
+   // wanted lane to position 0 of its half, and convert that low
+   // element to a scalar.
+   __m128d const vlo = _mm256_extractf128_pd(v, 0);   // lanes 0 and 1
+   __m128d const vhi = _mm256_extractf128_pd(v, 1);   // lanes 2 and 3
+   switch (n){
+   case 0: return _mm_cvtsd_f64(vlo);
+   case 1: return _mm_cvtsd_f64(_mm_shuffle_pd(vlo, vlo, _MM_SHUFFLE2(0,1)));
+   case 2: return _mm_cvtsd_f64(vhi);
+   case 3: return _mm_cvtsd_f64(_mm_shuffle_pd(vhi, vhi, _MM_SHUFFLE2(0,1)));
+   }
+   // Only reached for an out-of-range n.  The assert traps in debug
+   // builds; the explicit return keeps control from flowing off the end
+   // of a non-void function (undefined behaviour) when NDEBUG removes
+   // the assert.
+   assert(0);
+   return real_t(0);
+ }
realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
@@ -401,7 +413,13 @@ namespace vecmathlib {
}
real_t sum() const
{
- return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ // _mm256_hadd_pd adds adjacent pairs separately inside each 128-bit
+ // half, so one pass leaves v0+v1 in lanes 0,1 and v2+v3 in lanes 2,3.
+ // Adding one lane from each half gives the total.
+ realvec pairs(*this);
+ pairs = _mm256_hadd_pd(v, v);
+ return pairs[0] + pairs[2];
}
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
index 1c6648c..1794422 100644
--- a/vec_double_sse2.h
+++ b/vec_double_sse2.h
@@ -288,7 +288,15 @@ namespace vecmathlib {
realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
operator vector_t() const { return v; }
- real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ real_t operator[](int n) const
+ {
+   // Returns lane n (valid range 0..1) using register operations only:
+   // lane 0 is the low element; lane 1 is first shuffled into the low
+   // position.
+   switch (n){
+   case 0: return _mm_cvtsd_f64(v);
+   case 1: return _mm_cvtsd_f64(_mm_shuffle_pd(v, v, _MM_SHUFFLE2(0,1)));
+   }
+   // Only reached for an out-of-range n.  The assert traps in debug
+   // builds; the explicit return keeps control from flowing off the end
+   // of a non-void function (undefined behaviour) when NDEBUG removes
+   // the assert.
+   assert(0);
+   return real_t(0);
+ }
realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
@@ -317,7 +325,8 @@ namespace vecmathlib {
}
real_t sum() const
{
- return (*this)[0] + (*this)[1];
+ // Computes v0 + v1.  _mm_hadd_pd would do this in one step, but it is
+ // an SSE3 intrinsic and this is the SSE2 back end, so copy lane 1 into
+ // the low position and add the low elements instead.
+ __m128d const upper = _mm_unpackhi_pd(v, v);   // [v1, v1]
+ return _mm_cvtsd_f64(_mm_add_sd(v, upper));    // low element = v0 + v1
}
diff --git a/vec_float.h b/vec_float.h
index 60636ba..7378104 100644
--- a/vec_float.h
+++ b/vec_float.h
@@ -39,12 +39,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
@@ -110,12 +104,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
@@ -204,12 +192,6 @@ namespace vecmathlib {
typedef boolvec_t BV;
typedef floatprops<real_t> FP;
typedef mathfuncs<realvec_t> MF;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
diff --git a/vec_float_avx.h b/vec_float_avx.h
index 4a19673..e60f1eb 100644
--- a/vec_float_avx.h
+++ b/vec_float_avx.h
@@ -95,11 +95,15 @@ namespace vecmathlib {
bool all() const
{
- return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+ return
+ (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+ (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
}
bool any() const
{
- return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+ return
+ (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+ (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
}
@@ -350,7 +354,31 @@ namespace vecmathlib {
as[3], as[2], as[1], as[0])) {}
operator vector_t() const { return v; }
- real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ real_t operator[](int n) const
+ {
+   // Returns lane n (valid range 0..7) using register operations only:
+   // split the 256-bit vector into its two 128-bit halves, shuffle the
+   // wanted lane into position 0 of its half, and convert that low
+   // element to a scalar.  The last argument of _MM_SHUFFLE selects the
+   // source lane that lands in position 0.
+   __m128 const vlo = _mm256_extractf128_ps(v, 0);   // lanes 0..3
+   __m128 const vhi = _mm256_extractf128_ps(v, 1);   // lanes 4..7
+   switch (n){
+   case 0:
+     return _mm_cvtss_f32(vlo);
+   case 1:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(2,3,0,1)));
+   case 2:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(1,0,3,2)));
+   case 3:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vlo, vlo, _MM_SHUFFLE(0,1,2,3)));
+   case 4:
+     return _mm_cvtss_f32(vhi);
+   case 5:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(2,3,0,1)));
+   case 6:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(1,0,3,2)));
+   case 7:
+     return _mm_cvtss_f32(_mm_shuffle_ps(vhi, vhi, _MM_SHUFFLE(0,1,2,3)));
+   }
+   // Only reached for an out-of-range n.  The assert traps in debug
+   // builds; the explicit return keeps control from flowing off the end
+   // of a non-void function (undefined behaviour) when NDEBUG removes
+   // the assert.
+   assert(0);
+   return real_t(0);
+ }
realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
@@ -375,11 +403,24 @@ namespace vecmathlib {
real_t prod() const
{
- return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+ // Product of all 8 lanes, accumulated left to right starting from
+ // lane 0 (the same association order as a chained * expression).
+ real_t product = (*this)[0];
+ for (int lane = 1; lane < 8; ++lane) {
+   product = product * (*this)[lane];
+ }
+ return product;
}
real_t sum() const
{
- return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ // _mm256_hadd_ps adds adjacent pairs separately inside each 128-bit
+ // half.  After two passes every lane of the lower half holds
+ // (v0+v1)+(v2+v3) and every lane of the upper half holds
+ // (v4+v5)+(v6+v7); adding one lane from each half gives the total.
+ realvec quads(*this);
+ quads = _mm256_hadd_ps(v, v);
+ quads = _mm256_hadd_ps(quads.v, quads.v);
+ return quads[0] + quads[4];
}
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
index f99eae8..635e018 100644
--- a/vec_float_sse2.h
+++ b/vec_float_sse2.h
@@ -259,7 +259,17 @@ namespace vecmathlib {
realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
operator vector_t() const { return v; }
- real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ real_t operator[](int n) const
+ {
+   // Returns lane n (valid range 0..3) using register operations only:
+   // lane 0 is the low element; any other lane is first shuffled into
+   // position 0.  The last argument of _MM_SHUFFLE selects the source
+   // lane that lands in position 0.
+   switch (n){
+   case 0: return _mm_cvtss_f32(v);
+   case 1: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)));
+   case 2: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1,0,3,2)));
+   case 3: return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0,1,2,3)));
+   }
+   // Only reached for an out-of-range n.  The assert traps in debug
+   // builds; the explicit return keeps control from flowing off the end
+   // of a non-void function (undefined behaviour) when NDEBUG removes
+   // the assert.
+   assert(0);
+   return real_t(0);
+ }
realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
@@ -288,7 +298,11 @@ namespace vecmathlib {
}
real_t sum() const
{
- return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ // Computes (v0+v1) + (v2+v3), the same pairing two _mm_hadd_ps passes
+ // produce, but with SSE/SSE2 instructions only: _mm_hadd_ps is an SSE3
+ // intrinsic and this is the SSE2 back end.
+ __m128 const swapped = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); // [v1, v0, v3, v2]
+ __m128 const pairs = _mm_add_ps(v, swapped);       // lane 0 = v0+v1, lane 2 = v2+v3
+ __m128 const upper = _mm_movehl_ps(pairs, pairs);  // lane 0 = v2+v3
+ return _mm_cvtss_f32(_mm_add_ss(pairs, upper));
}
OpenPOWER on IntegriCloud