summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore1
-rw-r--r--test.cc155
-rw-r--r--vec_base.h70
-rw-r--r--vec_double.h30
-rw-r--r--vec_double_avx.h102
-rw-r--r--vec_double_sse2.h107
-rw-r--r--vec_float.h30
-rw-r--r--vec_float_avx.h83
-rw-r--r--vec_float_sse2.h95
-rw-r--r--vec_mask.h54
-rw-r--r--vec_pseudo.h131
11 files changed, 858 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 99bfbed..7854e42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ Makefile
bench
rules.ninja
build.ninja
+loop
diff --git a/test.cc b/test.cc
index 3d2ef88..bb174c3 100644
--- a/test.cc
+++ b/test.cc
@@ -71,6 +71,51 @@ struct vecmathlib_test {
+ static void check_mem(char const* const func,
+ real_t const* p,
+ realvec_t x,
+ realvec_t xorig,
+ int mval)
+ {
+ realvec_t y;
+ for (int i=0; i<realvec_t::size; ++i) {
+ y.set_elt(i, mval & (1<<i) ? p[i] : xorig[i]);
+ }
+ boolvec_t isbad = x != y;
+ if (any(isbad)) {
+ ++ num_errors;
+ cout << setprecision(realvec_t::digits10+2)
+ << "Error in " << func << ":\n"
+ << " found=" << x << "\n"
+ << " expected=" << y << "\n"
+ << " isbad=" << isbad << "\n"
+ << flush;
+ }
+ }
+
+ static void check_mem(char const* const func,
+ real_t const* p,
+ realvec_t x,
+ real_t const* porig,
+ int mval)
+ {
+ realvec_t pvec, y;
+ for (int i=0; i<realvec_t::size; ++i) {
+ pvec.set_elt(i, p[i]);
+ y.set_elt(i, mval & (1<<i) ? x[i] : porig[i]);
+ }
+ boolvec_t isbad = pvec != y;
+ if (any(isbad)) {
+ ++ num_errors;
+ cout << setprecision(realvec_t::digits10+2)
+ << "Error in " << func << ":\n"
+ << " found=" << pvec << "\n"
+ << " expected=" << y << "\n"
+ << " isbad=" << isbad << "\n"
+ << flush;
+ }
+ }
+
template<typename A>
static void check(char const* const func,
real_t fstd(typename A::scalar_t), realvec_t fvml(A),
@@ -214,6 +259,114 @@ struct vecmathlib_test {
+ static void test_mem()
+ {
+ cout << " testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush;
+ int const n = 6;
+ realvec_t x[n], xnew[n];
+ for (int i=0; i<n; ++i) {
+ x[i] = random(R(-10.0), R(+10.0));
+ }
+ realvec_t const z = random(R(-10.0), R(+10.0));
+
+ // loada
+ {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = realvec_t::loada(p);
+ check_mem("loada", p, y, z, ~0);
+ }
+
+ // loadu
+ for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = realvec_t::loadu(p+i);
+ check_mem("loadu", p+i, y, z, ~0);
+ }
+
+ // loadu(ioff)
+ for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = realvec_t::loadu(p, ioff);
+ check_mem("loadu(ioff)", p+ioff, y, z, ~0);
+ }
+
+ // storea
+ {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storea(z, p);
+ check_mem("storea", p, z, (real_t*)&x[1], ~0);
+ }
+
+ // storeu
+ for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storeu(z, p+i);
+ check_mem("storeu", p+i, z, (real_t*)&x[1]+i, ~0);
+ }
+
+ // storeu
+ for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storeu(z, p, ioff);
+ check_mem("storeu(ioff)", p+ioff, z, (real_t*)&x[1]+ioff, ~0);
+ }
+
+ for (int mval=0; mval<(1<<realvec_t::size); ++mval) {
+ boolvec_t mbool;
+ for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i));
+ typename realvec_t::mask_t mask(mbool);
+
+ // loada(mask)
+ {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = loada(p, z, mask);
+ check_mem("loada(mask)", p, y, z, mval);
+ }
+
+ // loadu(mask)
+ for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = loadu(p+i, z, mask);
+ check_mem("loadu(mask)", p+i, y, z, mval);
+ }
+
+ // loadu(ioff, mask)
+ for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ real_t *p = (real_t*)&x[1];
+ realvec_t y = loadu(p, ioff, z, mask);
+ check_mem("loadu(ioff,mask)", p+ioff, y, z, mval);
+ }
+
+ // storea
+ {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storea(z, p, mask);
+ check_mem("storea(mask)", p, z, (real_t*)&x[1], mval);
+ }
+
+ // storeu
+ for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storeu(z, p+i, mask);
+ check_mem("storeu(mask)", p+i, z, (real_t*)&x[1]+i, mval);
+ }
+
+ // storeu
+ for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ memcpy(xnew, x, n*sizeof *xnew);
+ real_t *p = (real_t*)&xnew[1];
+ storeu(z, p, ioff, mask);
+ check_mem("storeu(ioff,mask)", p+ioff, z, (real_t*)&x[1]+ioff, mval);
+ }
+
+ } // for mval
+ }
+
static int_t ilogb(real_t x) { return std::ilogb(x); }
static real_t scalbn(real_t x, int_t n) { return std::scalbn(x, n); }
static void test_fabs()
@@ -406,6 +559,8 @@ struct vecmathlib_test {
cout << "\n"
<< "Testing math functions for type " << realvec_t::name() << ":\n";
+ test_mem();
+
test_fabs();
test_convert();
diff --git a/vec_base.h b/vec_base.h
index ac67ae6..78802f5 100644
--- a/vec_base.h
+++ b/vec_base.h
@@ -5,6 +5,8 @@
#include <iostream>
+#include "vec_mask.h"
+
namespace vecmathlib {
@@ -108,6 +110,74 @@ namespace vecmathlib {
// realvec wrappers
template<typename real_t, int size>
+ inline realvec<real_t, size>
+ loada(real_t const* p,
+ realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const& m)
+ {
+ return x.loada(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size>
+ loadu(real_t const* p,
+ realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const& m)
+ {
+ return x.loadu(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size>
+ loadu(real_t const* p, size_t ioff,
+ realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const& m)
+ {
+ return x.loadu(p, ioff, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storea(realvec<real_t, size> x, real_t* p)
+ {
+ return x.storea(p);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realvec<real_t, size> x, real_t* p)
+ {
+ return x.storeu(p);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff)
+ {
+ return x.storeu(p, ioff);
+ }
+
+ template<typename real_t, int size>
+ inline void storea(realvec<real_t, size> x, real_t* p,
+ typename realvec<real_t, size>::mask_t const& m)
+ {
+ return x.storea(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realvec<real_t, size> x, real_t* p,
+ typename realvec<real_t, size>::mask_t const& m)
+ {
+ return x.storeu(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff,
+ typename realvec<real_t, size>::mask_t const &m)
+ {
+ return x.storeu(p, ioff, m);
+ }
+
+
+
+ template<typename real_t, int size>
inline intvec<real_t, size> as_int(realvec<real_t, size> x)
{
return x.as_int();
diff --git a/vec_double.h b/vec_double.h
index b2817f5..8559e28 100644
--- a/vec_double.h
+++ b/vec_double.h
@@ -219,6 +219,36 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p) { return *p; }
+ static realvec_t loadu(real_t const* p) { return *p; }
+ static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loada(p), *this);
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const { *p=v; }
+ void storeu(real_t* p) const { *p=v; }
+ void storeu(real_t* p, size_t ioff) const { p[ioff]=v; }
+ void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); }
+ void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return FP::as_int(v); }
intvec_t convert_int() const { return MF::vml_convert_int(v); }
diff --git a/vec_double_avx.h b/vec_double_avx.h
index 1cc9b9a..7afd41e 100644
--- a/vec_double_avx.h
+++ b/vec_double_avx.h
@@ -152,6 +152,7 @@ namespace vecmathlib {
intvec(ivector_t x): v(x) {}
intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+ static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
operator ivector_t() const { return v; }
int_t operator[](int n) const
@@ -361,6 +362,22 @@ namespace vecmathlib {
{
return (*this ^ x).convert_bool();
}
+ boolvec_t operator<(intvec const& x) const
+ {
+ return (*this - x).as_bool();
+ }
+ boolvec_t operator<=(intvec const& x) const
+ {
+ return ! (*this > x);
+ }
+ boolvec_t operator>(intvec const& x) const
+ {
+ return x < *this;
+ }
+ boolvec_t operator>=(intvec const& x) const
+ {
+ return ! (*this < x);
+ }
};
@@ -422,6 +439,91 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm256_load_pd(p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm256_loadu_pd(p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm256_store_pd(p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm256_storeu_pd(p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ _mm256_maskstore_pd(p, m.m.as_int(), v);
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0]) p[0] = (*this)[0];
+ if (m.m[1]) p[1] = (*this)[1];
+ if (m.m[2]) p[2] = (*this)[2];
+ if (m.m[3]) p[3] = (*this)[3];
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return _mm256_castpd_si256(v); }
intvec_t convert_int() const { return MF::vml_convert_int(*this); }
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
index 83ff66a..fd82ec0 100644
--- a/vec_double_sse2.h
+++ b/vec_double_sse2.h
@@ -17,6 +17,9 @@
#ifdef __SSE4A__ // AMD's SSE 4a
# include <ammintrin.h>
#endif
+#if defined __AVX__ // Intel's AVX
+# include <immintrin.h>
+#endif
@@ -263,6 +266,22 @@ namespace vecmathlib {
{
return (*this ^ x).convert_bool();
}
+ boolvec_t operator<(intvec const& x) const
+ {
+ return (*this - x).as_bool();
+ }
+ boolvec_t operator<=(intvec const& x) const
+ {
+ return ! (*this > x);
+ }
+ boolvec_t operator>(intvec const& x) const
+ {
+ return x < *this;
+ }
+ boolvec_t operator>=(intvec const& x) const
+ {
+ return ! (*this < x);
+ }
};
@@ -323,6 +342,94 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm_load_pd(p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm_loadu_pd(p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm_store_pd(p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm_storeu_pd(p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+#if defined __AVX__
+ _mm_maskstore_pd(p, m.m.as_int(), v);
+#else
+ if (m.m[0]) _mm_storel_pd(p , v);
+ else if (m.m[1]) _mm_storeh_pd(p+1, v);
+#endif
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0]) _mm_storel_pd(p , v);
+ else if (m.m[1]) _mm_storeh_pd(p+1, v);
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return _mm_castpd_si128(v); }
intvec_t convert_int() const { return MF::vml_convert_int(*this); }
diff --git a/vec_float.h b/vec_float.h
index 4b32b8b..5457ec7 100644
--- a/vec_float.h
+++ b/vec_float.h
@@ -219,6 +219,36 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p) { return *p; }
+ static realvec_t loadu(real_t const* p) { return *p; }
+ static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loada(p), *this);
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const { *p=v; }
+ void storeu(real_t* p) const { *p=v; }
+ void storeu(real_t* p, size_t ioff) const { p[ioff]=v; }
+ void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); }
+ void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return FP::as_int(v); }
intvec_t convert_int() const { return MF::vml_convert_int(v); }
diff --git a/vec_float_avx.h b/vec_float_avx.h
index a17c473..b7f6150 100644
--- a/vec_float_avx.h
+++ b/vec_float_avx.h
@@ -404,6 +404,89 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm256_load_ps(p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm256_loadu_ps(p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm256_store_ps(p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm256_storeu_ps(p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ _mm256_maskstore_ps(p, m.m.as_int(), v);
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return _mm256_castps_si256(v); }
intvec_t convert_int() const { return _mm256_cvtps_epi32(v); }
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
index 3ca7c57..f105cc5 100644
--- a/vec_float_sse2.h
+++ b/vec_float_sse2.h
@@ -17,6 +17,9 @@
#if defined __SSE4A__ // AMD's SSE 4a
# include <ammintrin.h>
#endif
+#if defined __AVX__ // Intel's AVX
+# include <immintrin.h>
+#endif
@@ -308,6 +311,98 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm_load_ps(p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm_loadu_ps(p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm_store_ps(p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm_storeu_ps(p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+#if defined __AVX__
+ _mm_maskstore_ps(p, m.m.as_int(), v);
+#else
+ if (m.m[0]) p[0] = (*this)[0];
+ if (m.m[1]) p[1] = (*this)[1];
+ if (m.m[2]) p[2] = (*this)[2];
+ if (m.m[3]) p[3] = (*this)[3];
+#endif
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0]) p[0] = (*this)[0];
+ if (m.m[1]) p[1] = (*this)[1];
+ if (m.m[2]) p[2] = (*this)[2];
+ if (m.m[3]) p[3] = (*this)[3];
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const { return _mm_castps_si128(v); }
intvec_t convert_int() const { return _mm_cvtps_epi32(v); }
diff --git a/vec_mask.h b/vec_mask.h
new file mode 100644
index 0000000..cbfcbf7
--- /dev/null
+++ b/vec_mask.h
@@ -0,0 +1,54 @@
+// -*-C++-*-
+
+#ifndef VEC_MASK_H
+#define VEC_MASK_H
+
+#include <cstdlib>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ class mask_t {
+ friend realvec_t;
+ typedef typename realvec_t::boolvec_t boolvec_t;
+ typedef typename realvec_t::intvec_t intvec_t;
+ static int const size = realvec_t::size;
+
+ std::ptrdiff_t imin, imax;
+ std::ptrdiff_t i;
+ boolvec_t m;
+ bool all_m;
+
+ public:
+ mask_t(boolvec_t m_): m(m_), all_m(all(m)) {}
+ mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
+ imin(imin_), imax(imax_),
+ i(imin - (ioff + imin) % size)
+ {
+ all_m = i>=imin && i<=imax-size;
+ if (__builtin_expect(all_m, true)) {
+ m = true;
+ } else {
+ m = (intvec_t(i) >= intvec_t(imin ) - intvec_t::iota() &&
+ intvec_t(i) <= intvec_t(imax-size) - intvec_t::iota());
+ }
+ }
+ std::ptrdiff_t index() const { return i; }
+ operator bool() const { return i >= imax; }
+ void operator++()
+ {
+ i += size;
+ all_m = i<=imax-size;
+ if (__builtin_expect(all_m, true)) {
+ m = true;
+ } else {
+ m = intvec_t(i) <= intvec_t(imax-size) - intvec_t::iota();
+ }
+ }
+ };
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_MASK_H
diff --git a/vec_pseudo.h b/vec_pseudo.h
index 1273a19..2df14f0 100644
--- a/vec_pseudo.h
+++ b/vec_pseudo.h
@@ -7,6 +7,7 @@
#include "mathfuncs.h"
#include "vec_base.h"
+#include <algorithm>
#include <cmath>
#include <string>
#include <typeinfo>
@@ -416,6 +417,68 @@ namespace vecmathlib {
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return loadu(p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ realvec_t res;
+ for (int d=0; d<size; ++d) res.v[d] = p[d];
+ return res;
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loada(p), *this);
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ return m.m.ifthen(loadu(p, ioff), *this);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ storeu(p);
+ }
+ void storeu(real_t* p) const
+ {
+ for (int d=0; d<size; ++d) p[d] = v[d];
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ storeu(p, m);
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ storeu(p+ioff, m);
+ }
+
+
+
intvec_t as_int() const
{
intvec_t res;
@@ -779,6 +842,74 @@ namespace vecmathlib {
// realpseudovec wrappers
template<typename real_t, int size>
+ inline realpseudovec<real_t, size>
+ loada(real_t const* p,
+ realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.loada(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline realpseudovec<real_t, size>
+ loadu(real_t const* p,
+ realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.loadu(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline realpseudovec<real_t, size>
+ loadu(real_t const* p, size_t ioff,
+ realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.loadu(p, ioff, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storea(realpseudovec<real_t, size> x, real_t* p)
+ {
+ return x.storea(p);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realpseudovec<real_t, size> x, real_t* p)
+ {
+ return x.storeu(p);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff)
+ {
+ return x.storeu(p, ioff);
+ }
+
+ template<typename real_t, int size>
+ inline void storea(realpseudovec<real_t, size> x, real_t* p,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.storea(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realpseudovec<real_t, size> x, real_t* p,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.storeu(p, m);
+ }
+
+ template<typename real_t, int size>
+ inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff,
+ typename realpseudovec<real_t, size>::mask_t const& m)
+ {
+ return x.storeu(p, ioff, m);
+ }
+
+
+
+ template<typename real_t, int size>
inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x)
{
return x.as_int();
OpenPOWER on IntegriCloud