11 files changed, 858 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index 99bfbed..7854e42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ Makefile
 bench
 rules.ninja
 build.ninja
+loop
diff --git a/test.cc b/test.cc
index 3d2ef88..bb174c3 100644
--- a/test.cc
+++ b/test.cc
@@ -71,6 +71,51 @@ struct vecmathlib_test {
   
   
   
+  static void check_mem(char const* const func, // label printed in the error report
+                        real_t const* p,        // memory the load under test read from
+                        realvec_t x,            // vector the load under test produced
+                        realvec_t xorig,        // values expected in lanes whose mask bit is clear
+                        int mval)               // bit i set => lane i is expected to equal p[i]
+  {
+    realvec_t y; // expected result, assembled lane by lane below
+    for (int i=0; i<realvec_t::size; ++i) {
+      y.set_elt(i, mval & (1<<i) ? p[i] : xorig[i]);
+    }
+    boolvec_t isbad = x != y;
+    if (any(isbad)) {
+      ++ num_errors; // counts one error per failing check, not per failing lane
+      cout << setprecision(realvec_t::digits10+2)
+           << "Error in " << func << ":\n"
+           << "   found=" << x << "\n"
+           << "   expected=" << y << "\n"
+           << "   isbad=" << isbad << "\n"
+           << flush;
+    }
+  }
+  
+  static void check_mem(char const* const func, // label printed in the error report
+                        real_t const* p,        // memory as left behind by the store under test
+                        realvec_t x,            // vector that was stored
+                        real_t const* porig,    // memory contents before the store
+                        int mval)               // bit i set => p[i] is expected to equal x[i]
+  {
+    realvec_t pvec, y; // pvec: what is in memory now; y: what should be there
+    for (int i=0; i<realvec_t::size; ++i) {
+      pvec.set_elt(i, p[i]);
+      y.set_elt(i, mval & (1<<i) ? x[i] : porig[i]);
+    }
+    boolvec_t isbad = pvec != y;
+    if (any(isbad)) {
+      ++ num_errors; // counts one error per failing check, not per failing lane
+      cout << setprecision(realvec_t::digits10+2)
+           << "Error in " << func << ":\n"
+           << "   found=" << pvec << "\n"
+           << "   expected=" << y << "\n"
+           << "   isbad=" << isbad << "\n"
+           << flush;
+    }
+  }
+  
   template<typename A>
   static void check(char const* const func,
                     real_t fstd(typename A::scalar_t), realvec_t fvml(A),
@@ -214,6 +259,114 @@ struct vecmathlib_test {
   
   
   
+  static void test_mem()
+  {
+    cout << "   testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush;
+    int const n = 6;
+    realvec_t x[n], xnew[n]; // x: reference data; xnew: scratch copy the stores write into
+    for (int i=0; i<n; ++i) {
+      x[i] = random(R(-10.0), R(+10.0));
+    }
+    realvec_t const z = random(R(-10.0), R(+10.0)); // store payload and masked-load fallback
+    
+    // loada
+    {
+      real_t *p = (real_t*)&x[1]; // address of a whole realvec_t element of x
+      realvec_t y = realvec_t::loada(p);
+      check_mem("loada", p, y, z, ~0);
+    }
+    
+    // loadu
+    for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+      real_t *p = (real_t*)&x[1];
+      realvec_t y = realvec_t::loadu(p+i);
+      check_mem("loadu", p+i, y, z, ~0);
+    }
+    
+    // loadu(ioff)
+    for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+      real_t *p = (real_t*)&x[1];
+      realvec_t y = realvec_t::loadu(p, ioff);
+      check_mem("loadu(ioff)", p+ioff, y, z, ~0);
+    }
+    
+    // storea
+    {
+      memcpy(xnew, x, n*sizeof *xnew); // reset the scratch copy before every store
+      real_t *p = (real_t*)&xnew[1];
+      storea(z, p);
+      check_mem("storea", p, z, (real_t*)&x[1], ~0);
+    }
+    
+    // storeu
+    for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+      memcpy(xnew, x, n*sizeof *xnew);
+      real_t *p = (real_t*)&xnew[1];
+      storeu(z, p+i);
+      check_mem("storeu", p+i, z, (real_t*)&x[1]+i, ~0);
+    }
+    
+    // storeu(ioff)
+    for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+      memcpy(xnew, x, n*sizeof *xnew);
+      real_t *p = (real_t*)&xnew[1];
+      storeu(z, p, ioff);
+      check_mem("storeu(ioff)", p+ioff, z, (real_t*)&x[1]+ioff, ~0);
+    }
+    
+    for (int mval=0; mval<(1<<realvec_t::size); ++mval) { // every lane-mask bit pattern
+      boolvec_t mbool;
+      for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i));
+      typename realvec_t::mask_t mask(mbool);
+      
+      // loada(mask)
+      {
+        real_t *p = (real_t*)&x[1];
+        realvec_t y = loada(p, z, mask);
+        check_mem("loada(mask)", p, y, z, mval);
+      }
+      
+      // loadu(mask)
+      for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+        real_t *p = (real_t*)&x[1];
+        realvec_t y = loadu(p+i, z, mask);
+        check_mem("loadu(mask)", p+i, y, z, mval);
+      }
+      
+      // loadu(ioff, mask)
+      for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+        real_t *p = (real_t*)&x[1];
+        realvec_t y = loadu(p, ioff, z, mask);
+        check_mem("loadu(ioff,mask)", p+ioff, y, z, mval);
+      }
+      
+      // storea(mask)
+      {
+        memcpy(xnew, x, n*sizeof *xnew);
+        real_t *p = (real_t*)&xnew[1];
+        storea(z, p, mask);
+        check_mem("storea(mask)", p, z, (real_t*)&x[1], mval);
+      }
+      
+      // storeu(mask)
+      for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+        memcpy(xnew, x, n*sizeof *xnew);
+        real_t *p = (real_t*)&xnew[1];
+        storeu(z, p+i, mask);
+        check_mem("storeu(mask)", p+i, z, (real_t*)&x[1]+i, mval);
+      }
+      
+      // storeu(ioff, mask)
+      for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+        memcpy(xnew, x, n*sizeof *xnew);
+        real_t *p = (real_t*)&xnew[1];
+        storeu(z, p, ioff, mask);
+        check_mem("storeu(ioff,mask)", p+ioff, z, (real_t*)&x[1]+ioff, mval);
+      }
+      
+    } // for mval
+  }
+  
   static int_t ilogb(real_t x) { return std::ilogb(x); }
   static real_t scalbn(real_t x, int_t n) { return std::scalbn(x, n); }
   static void test_fabs()
@@ -406,6 +559,8 @@ struct vecmathlib_test {
     cout << "\n"
          << "Testing math functions for type " << realvec_t::name() << ":\n";
     
+    test_mem();
+    
     test_fabs();
     test_convert();
     
diff --git a/vec_base.h b/vec_base.h
index ac67ae6..78802f5 100644
--- a/vec_base.h
+++ b/vec_base.h
@@ -5,6 +5,8 @@
 
 #include <iostream>
 
+#include "vec_mask.h"
+
 
 
 namespace vecmathlib {
@@ -108,6 +110,74 @@ namespace vecmathlib {
   // realvec wrappers
   
   template<typename real_t, int size>
+  inline realvec<real_t, size>
+  loada(real_t const* p,
+        realvec<real_t, size> x,
+        typename realvec<real_t, size>::mask_t const& m)
+  {
+    return x.loada(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size>
+  loadu(real_t const* p,
+        realvec<real_t, size> x,
+        typename realvec<real_t, size>::mask_t const& m)
+  {
+    return x.loadu(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size>
+  loadu(real_t const* p, size_t ioff,
+        realvec<real_t, size> x,
+        typename realvec<real_t, size>::mask_t const& m)
+  {
+    return x.loadu(p, ioff, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storea(realvec<real_t, size> x, real_t* p)
+  {
+    return x.storea(p);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realvec<real_t, size> x, real_t* p)
+  {
+    return x.storeu(p);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff)
+  {
+    return x.storeu(p, ioff);
+  }
+  
+  template<typename real_t, int size>
+  inline void storea(realvec<real_t, size> x, real_t* p,
+                     typename realvec<real_t, size>::mask_t const& m)
+  {
+    return x.storea(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realvec<real_t, size> x, real_t* p,
+                     typename realvec<real_t, size>::mask_t const& m)
+  {
+    return x.storeu(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff,
+                     typename realvec<real_t, size>::mask_t const& m)
+  {
+    x.storeu(p, ioff, m); // free-function form of the masked member store
+  }
+  
+  
+  
+  template<typename real_t, int size>
   inline intvec<real_t, size> as_int(realvec<real_t, size> x)
   {
     return x.as_int();
diff --git a/vec_double.h b/vec_double.h
index b2817f5..8559e28 100644
--- a/vec_double.h
+++ b/vec_double.h
@@ -219,6 +219,36 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p) { return *p; }
+    static realvec_t loadu(real_t const* p) { return *p; }
+    static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loadu(p), *this);
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const { *p=v; }
+    void storeu(real_t* p) const { *p=v; }
+    void storeu(real_t* p, size_t ioff) const { p[ioff]=v; }
+    void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); }
+    void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return FP::as_int(v); }
     intvec_t convert_int() const { return MF::vml_convert_int(v); }
     
diff --git a/vec_double_avx.h b/vec_double_avx.h
index 1cc9b9a..7afd41e 100644
--- a/vec_double_avx.h
+++ b/vec_double_avx.h
@@ -152,6 +152,7 @@ namespace vecmathlib {
     intvec(ivector_t x): v(x) {}
     intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
     intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+    static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); } // lane n holds n (arguments run from the highest lane down)
     
     operator ivector_t() const { return v; }
     int_t operator[](int n) const
@@ -361,6 +362,22 @@ namespace vecmathlib {
     {
       return (*this ^ x).convert_bool();
     }
+    boolvec_t operator<(intvec const& x) const
+    {
+      return (*this - x).as_bool(); // NOTE(review): presumably as_bool() reads the sign of the difference; if so this is wrong when the subtraction overflows -- confirm
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
   };
   
   
@@ -422,6 +439,91 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm256_load_pd(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_pd(p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const // aligned load; lanes with a clear mask bit keep *this
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) { // cached all(m.m), same test the masked loadu/storea use
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this); // still reads the whole aligned vector, then blends
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm256_store_pd(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm256_storeu_pd(p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        _mm256_maskstore_pd(p, m.m.as_int(), v);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return _mm256_castpd_si256(v); }
     intvec_t convert_int() const { return MF::vml_convert_int(*this); }
     
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
index 83ff66a..fd82ec0 100644
--- a/vec_double_sse2.h
+++ b/vec_double_sse2.h
@@ -17,6 +17,9 @@
 #ifdef __SSE4A__                // AMD's SSE 4a
 #  include <ammintrin.h>
 #endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
 
 
 
@@ -263,6 +266,22 @@ namespace vecmathlib {
     {
       return (*this ^ x).convert_bool();
     }
+    boolvec_t operator<(intvec const& x) const
+    {
+      return (*this - x).as_bool(); // NOTE(review): presumably as_bool() reads the sign of the difference; if so this is wrong when the subtraction overflows -- confirm
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
   };
   
   
@@ -323,6 +342,94 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm_load_pd(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm_loadu_pd(p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const // aligned load; lanes with a clear mask bit keep *this
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) { // cached all(m.m), same test the masked loadu/storea use
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this); // still reads the whole aligned vector, then blends
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm_store_pd(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm_storeu_pd(p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const // aligned store of only the lanes selected by m
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else { // not all lanes set; assuming all_m mirrors all(m.m), at most one of the two lanes is active here
+#if defined __AVX__
+        _mm_maskstore_pd(p, m.m.as_int(), v);
+#else
+        if      (m.m[0]) _mm_storel_pd(p  , v); // only the low lane
+        else if (m.m[1]) _mm_storeh_pd(p+1, v); // only the high lane; else-if suffices because both-set is handled above
+#endif
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if      (m.m[0]) _mm_storel_pd(p  , v);
+        else if (m.m[1]) _mm_storeh_pd(p+1, v);
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return _mm_castpd_si128(v); }
     intvec_t convert_int() const { return MF::vml_convert_int(*this); }
     
diff --git a/vec_float.h b/vec_float.h
index 4b32b8b..5457ec7 100644
--- a/vec_float.h
+++ b/vec_float.h
@@ -219,6 +219,36 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p) { return *p; }
+    static realvec_t loadu(real_t const* p) { return *p; }
+    static realvec_t loadu(real_t const* p, size_t ioff) { return p[ioff]; }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loadu(p), *this);
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const { *p=v; }
+    void storeu(real_t* p) const { *p=v; }
+    void storeu(real_t* p, size_t ioff) const { p[ioff]=v; }
+    void storea(real_t* p, mask_t const& m) const { if (m.all_m) storea(p); }
+    void storeu(real_t* p, mask_t const& m) const { if (m.all_m) storeu(p); }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return FP::as_int(v); }
     intvec_t convert_int() const { return MF::vml_convert_int(v); }
     
diff --git a/vec_float_avx.h b/vec_float_avx.h
index a17c473..b7f6150 100644
--- a/vec_float_avx.h
+++ b/vec_float_avx.h
@@ -404,6 +404,89 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm256_load_ps(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_ps(p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const // aligned load; lanes with a clear mask bit keep *this
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) { // cached all(m.m), same test the masked loadu/storea use
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this); // still reads the whole aligned vector, then blends
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm256_store_ps(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm256_storeu_ps(p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        _mm256_maskstore_ps(p, m.m.as_int(), v);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return _mm256_castps_si256(v); }
     intvec_t convert_int() const { return _mm256_cvtps_epi32(v); }
     
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
index 3ca7c57..f105cc5 100644
--- a/vec_float_sse2.h
+++ b/vec_float_sse2.h
@@ -17,6 +17,9 @@
 #if defined __SSE4A__           // AMD's SSE 4a
 #  include <ammintrin.h>
 #endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
 
 
 
@@ -308,6 +311,98 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm_load_ps(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm_loadu_ps(p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const // aligned load; lanes with a clear mask bit keep *this
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) { // cached all(m.m), same test the masked loadu/storea use
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this); // still reads the whole aligned vector, then blends
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm_store_ps(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm_storeu_ps(p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+#if defined __AVX__
+        _mm_maskstore_ps(p, m.m.as_int(), v);
+#else
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+#endif
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const { return _mm_castps_si128(v); }
     intvec_t convert_int() const { return _mm_cvtps_epi32(v); }
     
diff --git a/vec_mask.h b/vec_mask.h
new file mode 100644
index 0000000..cbfcbf7
--- /dev/null
+++ b/vec_mask.h
@@ -0,0 +1,54 @@
+// -*-C++-*-
+
+#ifndef VEC_MASK_H
+#define VEC_MASK_H
+
+#include <cstdlib>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  class mask_t {
+    friend realvec_t; // realvec_t's masked load/store members read m and all_m directly
+    typedef typename realvec_t::boolvec_t boolvec_t;
+    typedef typename realvec_t::intvec_t intvec_t;
+    static int const size = realvec_t::size;
+    // Lane k of the mask stands for index i+k; it is active iff imin <= i+k < imax.
+    std::ptrdiff_t imin, imax;
+    std::ptrdiff_t i;
+    boolvec_t m;
+    bool all_m; // cached all(m), lets callers take the unmasked fast path
+    
+  public:
+    mask_t(boolvec_t m_): imin(0), imax(0), i(0), m(m_), all_m(all(m)) {} // explicit mask; range fields zeroed rather than left indeterminate
+    mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
+      imin(imin_), imax(imax_),
+      i(imin - (ioff + imin) % size) // makes ioff+i a multiple of size (for non-negative ioff+imin)
+    {
+      all_m = i>=imin && i<=imax-size;
+      if (__builtin_expect(all_m, true)) {
+        m = true;
+      } else {
+        m = (intvec_t(i) >= intvec_t(imin  ) - intvec_t::iota() &&  // i+k >= imin
+             intvec_t(i) <= intvec_t(imax-1) - intvec_t::iota());   // i+k <= imax-1 (was imax-size, which dropped valid tail lanes)
+      }
+    }
+    std::ptrdiff_t index() const { return i; }
+    operator bool() const { return i >= imax; } // NOTE(review): true once i has reached imax; confirm loop callers expect this polarity
+    void operator++()
+    {
+      i += size;
+      all_m = i<=imax-size;
+      if (__builtin_expect(all_m, true)) {
+        m = true;
+      } else {
+        m = intvec_t(i) <= intvec_t(imax-1) - intvec_t::iota(); // i+k <= imax-1 (was imax-size, which dropped valid tail lanes)
+      }
+    }
+  };
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_MASK_H
diff --git a/vec_pseudo.h b/vec_pseudo.h
index 1273a19..2df14f0 100644
--- a/vec_pseudo.h
+++ b/vec_pseudo.h
@@ -7,6 +7,7 @@
 #include "mathfuncs.h"
 #include "vec_base.h"
 
+#include <algorithm>
 #include <cmath>
 #include <string>
 #include <typeinfo>
@@ -416,6 +417,68 @@ namespace vecmathlib {
     
     
     
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return loadu(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      realvec_t res;
+      for (int d=0; d<size; ++d) res.v[d] = p[d];
+      return res;
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      return m.m.ifthen(loadu(p), *this);
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      return m.m.ifthen(loadu(p, ioff), *this);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      storeu(p);
+    }
+    void storeu(real_t* p) const
+    {
+      for (int d=0; d<size; ++d) p[d] = v[d];
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      storeu(p, m);
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
     intvec_t as_int() const
     {
       intvec_t res;
@@ -779,6 +842,74 @@ namespace vecmathlib {
   // realpseudovec wrappers
   
   template<typename real_t, int size>
+  inline realpseudovec<real_t, size>
+  loada(real_t const* p,
+        realpseudovec<real_t, size> x,
+        typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.loada(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline realpseudovec<real_t, size>
+  loadu(real_t const* p,
+        realpseudovec<real_t, size> x,
+        typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.loadu(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline realpseudovec<real_t, size>
+  loadu(real_t const* p, size_t ioff,
+        realpseudovec<real_t, size> x,
+        typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.loadu(p, ioff, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storea(realpseudovec<real_t, size> x, real_t* p)
+  {
+    return x.storea(p);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realpseudovec<real_t, size> x, real_t* p)
+  {
+    return x.storeu(p);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff)
+  {
+    return x.storeu(p, ioff);
+  }
+  
+  template<typename real_t, int size>
+  inline void storea(realpseudovec<real_t, size> x, real_t* p,
+                     typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.storea(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realpseudovec<real_t, size> x, real_t* p,
+                     typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.storeu(p, m);
+  }
+  
+  template<typename real_t, int size>
+  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff,
+                     typename realpseudovec<real_t, size>::mask_t const& m)
+  {
+    return x.storeu(p, ioff, m);
+  }
+  
+  
+  
+  template<typename real_t, int size>
   inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x)
   {
     return x.as_int();