diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | test.cc | 155 | ||||
-rw-r--r-- | vec_base.h | 70 | ||||
-rw-r--r-- | vec_double.h | 30 | ||||
-rw-r--r-- | vec_double_avx.h | 102 | ||||
-rw-r--r-- | vec_double_sse2.h | 107 | ||||
-rw-r--r-- | vec_float.h | 30 | ||||
-rw-r--r-- | vec_float_avx.h | 83 | ||||
-rw-r--r-- | vec_float_sse2.h | 95 | ||||
-rw-r--r-- | vec_mask.h | 54 | ||||
-rw-r--r-- | vec_pseudo.h | 131 |
11 files changed, 858 insertions, 0 deletions
@@ -14,3 +14,4 @@ Makefile bench rules.ninja build.ninja +loop @@ -71,6 +71,51 @@ struct vecmathlib_test { + static void check_mem(char const* const func, + real_t const* p, + realvec_t x, + realvec_t xorig, + int mval) + { + realvec_t y; + for (int i=0; i<realvec_t::size; ++i) { + y.set_elt(i, mval & (1<<i) ? p[i] : xorig[i]); + } + boolvec_t isbad = x != y; + if (any(isbad)) { + ++ num_errors; + cout << setprecision(realvec_t::digits10+2) + << "Error in " << func << ":\n" + << " found=" << x << "\n" + << " expected=" << y << "\n" + << " isbad=" << isbad << "\n" + << flush; + } + } + + static void check_mem(char const* const func, + real_t const* p, + realvec_t x, + real_t const* porig, + int mval) + { + realvec_t pvec, y; + for (int i=0; i<realvec_t::size; ++i) { + pvec.set_elt(i, p[i]); + y.set_elt(i, mval & (1<<i) ? x[i] : porig[i]); + } + boolvec_t isbad = pvec != y; + if (any(isbad)) { + ++ num_errors; + cout << setprecision(realvec_t::digits10+2) + << "Error in " << func << ":\n" + << " found=" << pvec << "\n" + << " expected=" << y << "\n" + << " isbad=" << isbad << "\n" + << flush; + } + } + template<typename A> static void check(char const* const func, real_t fstd(typename A::scalar_t), realvec_t fvml(A), @@ -214,6 +259,114 @@ struct vecmathlib_test { + static void test_mem() + { + cout << " testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush; + int const n = 6; + realvec_t x[n], xnew[n]; + for (int i=0; i<n; ++i) { + x[i] = random(R(-10.0), R(+10.0)); + } + realvec_t const z = random(R(-10.0), R(+10.0)); + + // loada + { + real_t *p = (real_t*)&x[1]; + realvec_t y = realvec_t::loada(p); + check_mem("loada", p, y, z, ~0); + } + + // loadu + for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + real_t *p = (real_t*)&x[1]; + realvec_t y = realvec_t::loadu(p+i); + check_mem("loadu", p+i, y, z, ~0); + } + + // loadu(ioff) + for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + real_t *p = (real_t*)&x[1]; + realvec_t y = realvec_t::loadu(p, ioff); + check_mem("loadu(ioff)", p+ioff, y, z, ~0); + } + + // storea + { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storea(z, p); + check_mem("storea", p, z, (real_t*)&x[1], ~0); + } + + // storeu + for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storeu(z, p+i); + check_mem("storeu", p+i, z, (real_t*)&x[1]+i, ~0); + } + + // storeu + for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storeu(z, p, ioff); + check_mem("storeu(ioff)", p+ioff, z, (real_t*)&x[1]+ioff, ~0); + } + + for (int mval=0; mval<(1<<realvec_t::size); ++mval) { + boolvec_t mbool; + for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i)); + typename realvec_t::mask_t mask(mbool); + + // loada(mask) + { + real_t *p = (real_t*)&x[1]; + realvec_t y = loada(p, z, mask); + check_mem("loada(mask)", p, y, z, mval); + } + + // loadu(mask) + for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + real_t *p = (real_t*)&x[1]; + realvec_t y = loadu(p+i, z, mask); + check_mem("loadu(mask)", p+i, y, z, mval); + } + + // loadu(ioff, mask) + for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + real_t *p = (real_t*)&x[1]; + realvec_t y = loadu(p, ioff, z, mask); + check_mem("loadu(ioff,mask)", p+ioff, y, z, mval); + } + + // storea + { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storea(z, p, mask); + check_mem("storea(mask)", p, z, (real_t*)&x[1], mval); + } + + // storeu + for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storeu(z, p+i, mask); + check_mem("storeu(mask)", p+i, z, (real_t*)&x[1]+i, mval); + } + + // storeu + for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + memcpy(xnew, x, n*sizeof *xnew); + real_t *p = (real_t*)&xnew[1]; + storeu(z, p, ioff, mask); + check_mem("storeu(ioff,mask)", p+ioff, z, (real_t*)&x[1]+ioff, mval); + } + + } // for mval + } + static int_t ilogb(real_t x) { return std::ilogb(x); } static real_t scalbn(real_t x, int_t n) { return std::scalbn(x, n); } static void test_fabs() @@ -406,6 +559,8 @@ struct vecmathlib_test { cout << "\n" << "Testing math functions for type " << realvec_t::name() << ":\n"; + test_mem(); + test_fabs(); test_convert(); @@ -5,6 +5,8 @@ #include <iostream> +#include "vec_mask.h" + namespace vecmathlib { @@ -108,6 +110,74 @@ namespace vecmathlib { // realvec wrappers template<typename real_t, int size> + inline realvec<real_t, size> + loada(real_t const* p, + realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const& m) + { + return x.loada(p, m); + } + + template<typename real_t, int size> + inline realvec<real_t, size> + loadu(real_t const* p, + realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const& m) + { + return x.loadu(p, m); + } + + template<typename real_t, int size> + inline realvec<real_t, size> + loadu(real_t const* p, size_t ioff, + realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const& m) + { + return x.loadu(p, ioff, m); + } + + template<typename real_t, int size> + inline void storea(realvec<real_t, size> x, real_t* p) + { + return x.storea(p); + } + + template<typename real_t, int size> + inline void storeu(realvec<real_t, size> x, real_t* p) + { + return x.storeu(p); + } + + template<typename real_t, int size> + inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff) + { + return x.storeu(p, ioff); + } + + template<typename real_t, int size> + inline void storea(realvec<real_t, size> x, real_t* p, + typename realvec<real_t, size>::mask_t const& m) + { + return x.storea(p, m); + } + + template<typename real_t, int size> + inline void storeu(realvec<real_t, size> x, real_t* p, + typename realvec<real_t, size>::mask_t const& m) + { + return x.storeu(p, m); + } + + template<typename real_t, int size> + inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff, + typename realvec<real_t, size>::mask_t const &m) + { + return x.storeu(p, ioff, m); + } + + + + template<typename real_t, int size> inline intvec<real_t, size> as_int(realvec<real_t, size> x) { return x.as_int(); diff --git a/vec_double.h b/vec_double.h index b2817f5..8559e28 100644 --- a/vec_double.h +++ b/vec_double.h @@ -219,6 +219,36 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) { return *p; } + static realvec_t loadu(real_t const* p) { return *p; } + static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; } + realvec_t loada(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + return loadu(p+ioff, m); + } + + void storea(real_t* p) const { *p=v; } + void storeu(real_t* p) const { *p=v; } + void storeu(real_t* p, size_t ioff) const { p[ioff]=v; } + void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); } + void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return FP::as_int(v); } intvec_t convert_int() const { return MF::vml_convert_int(v); } diff --git a/vec_double_avx.h b/vec_double_avx.h index 1cc9b9a..7afd41e 100644 --- a/vec_double_avx.h +++ b/vec_double_avx.h @@ -152,6 +152,7 @@ namespace vecmathlib { intvec(ivector_t x): v(x) {} intvec(int_t a): v(_mm256_set1_epi64x(a)) {} intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {} + static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); } operator ivector_t() const { return v; } int_t operator[](int n) const @@ -361,6 +362,22 @@ namespace vecmathlib { { return (*this ^ x).convert_bool(); } + boolvec_t operator<(intvec const& x) const + { + return (*this - x).as_bool(); + } + boolvec_t operator<=(intvec const& x) const + { + return ! (*this > x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! (*this < x); + } }; @@ -422,6 +439,91 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm256_load_pd(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_pd(p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm256_store_pd(p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_pd(p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_pd(p, m.m.as_int(), v); + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return _mm256_castpd_si256(v); } intvec_t convert_int() const { return MF::vml_convert_int(*this); } diff --git a/vec_double_sse2.h b/vec_double_sse2.h index 83ff66a..fd82ec0 100644 --- a/vec_double_sse2.h +++ b/vec_double_sse2.h @@ -17,6 +17,9 @@ #ifdef __SSE4A__ // AMD's SSE 4a # include <ammintrin.h> #endif +#if defined __AVX__ // Intel's AVX +# include <immintrin.h> +#endif @@ -263,6 +266,22 @@ namespace vecmathlib { { return (*this ^ x).convert_bool(); } + boolvec_t operator<(intvec const& x) const + { + return (*this - x).as_bool(); + } + boolvec_t operator<=(intvec const& x) const + { + return ! (*this > x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! (*this < x); + } }; @@ -323,6 +342,94 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm_load_pd(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm_loadu_pd(p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm_store_pd(p, v); + } + void storeu(real_t* p) const + { + return _mm_storeu_pd(p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { +#if defined __AVX__ + _mm_maskstore_pd(p, m.m.as_int(), v); +#else + if (m.m[0]) _mm_storel_pd(p , v); + else if (m.m[1]) _mm_storeh_pd(p+1, v); +#endif + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) _mm_storel_pd(p , v); + else if (m.m[1]) _mm_storeh_pd(p+1, v); + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return _mm_castpd_si128(v); } intvec_t convert_int() const { return MF::vml_convert_int(*this); } diff --git a/vec_float.h b/vec_float.h index 4b32b8b..5457ec7 100644 --- a/vec_float.h +++ b/vec_float.h @@ -219,6 +219,36 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) { return *p; } + static realvec_t loadu(real_t const* p) { return *p; } + static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; } + realvec_t loada(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + return loadu(p+ioff, m); + } + + void storea(real_t* p) const { *p=v; } + void storeu(real_t* p) const { *p=v; } + void storeu(real_t* p, size_t ioff) const { p[ioff]=v; } + void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); } + void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return FP::as_int(v); } intvec_t convert_int() const { return MF::vml_convert_int(v); } diff --git a/vec_float_avx.h b/vec_float_avx.h index a17c473..b7f6150 100644 --- a/vec_float_avx.h +++ b/vec_float_avx.h @@ -404,6 +404,89 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm256_load_ps(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_ps(p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm256_store_ps(p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_ps(p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_ps(p, m.m.as_int(), v); + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return _mm256_castps_si256(v); } intvec_t convert_int() const { return _mm256_cvtps_epi32(v); } diff --git a/vec_float_sse2.h b/vec_float_sse2.h index 3ca7c57..f105cc5 100644 --- a/vec_float_sse2.h +++ b/vec_float_sse2.h @@ -17,6 +17,9 @@ #if defined __SSE4A__ // AMD's SSE 4a # include <ammintrin.h> #endif +#if defined __AVX__ // Intel's AVX +# include <immintrin.h> +#endif @@ -308,6 +311,98 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm_load_ps(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm_loadu_ps(p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm_store_ps(p, v); + } + void storeu(real_t* p) const + { + return _mm_storeu_ps(p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { +#if defined __AVX__ + _mm_maskstore_ps(p, m.m.as_int(), v); +#else + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; +#endif + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + intvec_t as_int() const { return _mm_castps_si128(v); } intvec_t convert_int() const { return _mm_cvtps_epi32(v); } diff --git a/vec_mask.h b/vec_mask.h new file mode 100644 index 0000000..cbfcbf7 --- /dev/null +++ b/vec_mask.h @@ -0,0 +1,54 @@ +// -*-C++-*- + +#ifndef VEC_MASK_H +#define VEC_MASK_H + +#include <cstdlib> + + + +namespace vecmathlib { + + template<typename realvec_t> + class mask_t { + friend realvec_t; + typedef typename realvec_t::boolvec_t boolvec_t; + typedef typename realvec_t::intvec_t intvec_t; + static int const size = realvec_t::size; + + std::ptrdiff_t imin, imax; + std::ptrdiff_t i; + boolvec_t m; + bool all_m; + + public: + mask_t(boolvec_t m_): m(m_), all_m(all(m)) {} + mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff): + imin(imin_), imax(imax_), + i(imin - (ioff + imin) % size) + { + all_m = i>=imin && i<=imax-size; + if (__builtin_expect(all_m, true)) { + m = true; + } else { + m = (intvec_t(i) >= intvec_t(imin ) - intvec_t::iota() && + intvec_t(i) <= intvec_t(imax-size) - intvec_t::iota()); + } + } + std::ptrdiff_t index() const { return i; } + operator bool() const { return i >= imax; } + void operator++() + { + i += size; + all_m = i<=imax-size; + if (__builtin_expect(all_m, true)) { + m = true; + } else { + m = intvec_t(i) <= intvec_t(imax-size) - intvec_t::iota(); + } + } + }; + +} // namespace vecmathlib + +#endif // #ifndef VEC_MASK_H diff --git a/vec_pseudo.h b/vec_pseudo.h index 1273a19..2df14f0 100644 --- a/vec_pseudo.h +++ b/vec_pseudo.h @@ -7,6 +7,7 @@ #include "mathfuncs.h" #include "vec_base.h" +#include <algorithm> #include <cmath> #include <string> #include <typeinfo> @@ -416,6 +417,68 @@ namespace vecmathlib { + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return loadu(p); + } + static realvec_t loadu(real_t const* p) + { + realvec_t res; + for (int d=0; d<size; ++d) res.v[d] = p[d]; + return res; + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + return m.m.ifthen(loadu(p, ioff), *this); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + storeu(p); + } + void storeu(real_t* p) const + { + for (int d=0; d<size; ++d) p[d] = v[d]; + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + storeu(p, m); + } + void storeu(real_t* p, mask_t const& m) const + { + for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d]; + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + storeu(p+ioff, m); + } + + + intvec_t as_int() const { intvec_t res; @@ -779,6 +842,74 @@ namespace vecmathlib { // realpseudovec wrappers template<typename real_t, int size> + inline realpseudovec<real_t, size> + loada(real_t const* p, + realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.loada(p, m); + } + + template<typename real_t, int size> + inline realpseudovec<real_t, size> + loadu(real_t const* p, + realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.loadu(p, m); + } + + template<typename real_t, int size> + inline realpseudovec<real_t, size> + loadu(real_t const* p, size_t ioff, + realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.loadu(p, ioff, m); + } + + template<typename real_t, int size> + inline void storea(realpseudovec<real_t, size> x, real_t* p) + { + return x.storea(p); + } + + template<typename real_t, int size> + inline void storeu(realpseudovec<real_t, size> x, real_t* p) + { + return x.storeu(p); + } + + template<typename real_t, int size> + inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff) + { + return x.storeu(p, ioff); + } + + template<typename real_t, int size> + inline void storea(realpseudovec<real_t, size> x, real_t* p, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.storea(p, m); + } + + template<typename real_t, int size> + inline void storeu(realpseudovec<real_t, size> x, real_t* p, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.storeu(p, m); + } + + template<typename real_t, int size> + inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff, + typename realpseudovec<real_t, size>::mask_t const& m) + { + return x.storeu(p, ioff, m); + } + + + + template<typename real_t, int size> inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) { return x.as_int(); |