6 files changed, 1373 insertions, 31 deletions
diff --git a/floatprops.h b/floatprops.h
index 8d4ddcf..c39c788 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -3,6 +3,8 @@
 #ifndef FLOATPROPS_H
 #define FLOATPROPS_H
 
+#include "floattypes.h"
+
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -27,8 +29,116 @@ namespace vecmathlib {
     //    max_exponent
   };
   
-  template<typename int_t>
-  struct intprops {
+  
+  
+  // Properties of fp8
+  template<>
+  struct floatprops<fp8> {
+    typedef fp8 real_t;
+    typedef int8_t int_t;
+    typedef uint8_t uint_t;
+    
+    // Definitions that might come from numeric_limits<> instead:
+    static int const digits = 4;
+    static int epsilon() { __builtin_unreachable(); }
+    static int const min_exponent = -6;
+    static int const max_exponent = 7;
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);
+    static int const mantissa_bits = digits - 1;
+    static int const signbit_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+    static int const exponent_offset = 2 - min_exponent;
+    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+    static uint_t const signbit_mask = uint_t(1) << (bits-1);
+    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                  uint_t(~uint_t(0)),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static real_t as_float(int_t x)
+    {
+      real_t res;
+      std::memcpy(&res, &x, sizeof res);
+      return res;
+    }
+    static int_t as_int(real_t x)
+    {
+      int_t res;
+      std::memcpy(&res, &x, sizeof res);
+      return res;
+    }
+    
+    // Convert values
+    static real_t convert_float(int_t x) { __builtin_unreachable(); }
+    static int_t convert_int(real_t x) { __builtin_unreachable(); }
+  };
+  
+  
+  
+  // Properties of fp16
+  template<>
+  struct floatprops<fp16> {
+    typedef fp16 real_t;
+    typedef int16_t int_t;
+    typedef uint16_t uint_t;
+    
+    // Definitions that might come from numeric_limits<> instead:
+    static int const digits = 11;
+    static int epsilon() { __builtin_unreachable(); }
+    static int const min_exponent = -14;
+    static int const max_exponent = 15;
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);
+    static int const mantissa_bits = digits - 1;
+    static int const signbit_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+    static int const exponent_offset = 2 - min_exponent;
+    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+    static uint_t const signbit_mask = uint_t(1) << (bits-1);
+    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                  uint_t(~uint_t(0)),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static real_t as_float(int_t x)
+    {
+      real_t res;
+      std::memcpy(&res, &x, sizeof res);
+      return res;
+    }
+    static int_t as_int(real_t x)
+    {
+      int_t res;
+      std::memcpy(&res, &x, sizeof res);
+      return res;
+    }
+    
+    // Convert values
+    static real_t convert_float(int_t x) { __builtin_unreachable(); }
+    static int_t convert_int(real_t x) { __builtin_unreachable(); }
   };
   
   
@@ -66,7 +176,7 @@ namespace vecmathlib {
                   "error in masks");
     
     // Re-interpret bit patterns
-    static inline real_t as_float(int_t x)
+    static real_t as_float(int_t x)
     {
       // return *(real_t*)&x;
       // union { int_t i; real_t r; } ir;
@@ -75,7 +185,7 @@ namespace vecmathlib {
       std::memcpy(&res, &x, sizeof res);
       return res;
     }
-    static inline int_t as_int(real_t x)
+    static int_t as_int(real_t x)
     {
       // return *(int_t*)&x;
       // union { real_t r; int_t i; } ri;
@@ -86,8 +196,8 @@ namespace vecmathlib {
     }
     
     // Convert values
-    static inline real_t convert_float(int_t x) { return real_t(x); }
-    static inline int_t convert_int(real_t x)
+    static real_t convert_float(int_t x) { return real_t(x); }
+    static int_t convert_int(real_t x)
     {
       static_assert(sizeof std::lrint(x) >= sizeof(int_t),
                     "lrint() has wrong return type");
@@ -103,11 +213,6 @@ namespace vecmathlib {
     }
   };
   
-  template<>
-  struct intprops<floatprops<float>::int_t> {
-    typedef float real_t;
-  };
-  
   
   
   // Properties of double
@@ -143,7 +248,7 @@ namespace vecmathlib {
                   "error in masks");
     
     // Re-interpret bit patterns
-    static inline real_t as_float(int_t x)
+    static real_t as_float(int_t x)
     {
       // return *(real_t*)&x;
       // union { int_t i; real_t r; } ir;
@@ -152,7 +257,7 @@ namespace vecmathlib {
       std::memcpy(&res, &x, sizeof res);
       return res;
     }
-    static inline int_t as_int(real_t x)
+    static int_t as_int(real_t x)
     {
       // return *(int_t*)&x;
       // union { real_t r; int_t i; } ri;
@@ -163,8 +268,8 @@ namespace vecmathlib {
     }
     
     // Convert values
-    static inline real_t convert_float(int_t x) { return real_t(x); }
-    static inline int_t convert_int(real_t x)
+    static real_t convert_float(int_t x) { return real_t(x); }
+    static int_t convert_int(real_t x)
     {
       static_assert(sizeof std::lrint(x) >= sizeof(int_t),
                     "lrint() has wrong return type");
@@ -180,11 +285,6 @@ namespace vecmathlib {
     }
   };
   
-  template<>
-  struct intprops<floatprops<double>::int_t> {
-    typedef double real_t;
-  };
-  
   
   
 } // namespace vecmathlib
diff --git a/floattypes.h b/floattypes.h
new file mode 100644
index 0000000..008f695
--- /dev/null
+++ b/floattypes.h
@@ -0,0 +1,22 @@
+// -*-C++-*-
+
+#ifndef FLOATTYPESS_H
+#define FLOATTYPESS_H
+
+#include <cstdint>
+
+namespace vecmathlib {
+  
+  struct fp8 {
+    // 1 bit sign, 4 bits exponent, 3 bits mantissa
+    std::uint8_t val;
+  };
+  
+  struct fp16 {
+    // 1 bit sign, 5 bits exponent, 10 bits mantissa
+    std::uint16_t val;
+  };
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef FLOATTYPES_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
index 1050147..952dbef 100644
--- a/mathfuncs_fabs.h
+++ b/mathfuncs_fabs.h
@@ -14,7 +14,7 @@ namespace vecmathlib {
   template<typename realvec_t>
   realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
   {
-    intvec_t value = as_int(x) & IV(~FP::signbit_mask);
+    intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
     intvec_t sign = as_int(y) & IV(FP::signbit_mask);
     return as_float(sign | value);
   }
@@ -22,7 +22,7 @@ namespace vecmathlib {
   template<typename realvec_t>
   realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
   {
-    return as_float(as_int(x) & IV(~FP::signbit_mask));
+    return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
   }
   
   template<typename realvec_t>
diff --git a/vec_fp16_avx.h b/vec_fp16_avx.h
new file mode 100644
index 0000000..b52d67c
--- /dev/null
+++ b/vec_fp16_avx.h
@@ -0,0 +1,579 @@
+// -*-C++-*-
+
+#ifndef VEC_FP16_AVX_H
+#define VEC_FP16_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FP16_16
+  template<> struct boolvec<fp16,16>;
+  template<> struct intvec<fp16,16>;
+  template<> struct realvec<fp16,16>;
+  
+  
+  
+  template<>
+  struct boolvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef bool scalar_t;
+    typedef __m256i bvector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
+    boolvec(bool const* as):
+    v(_mm256_set_epi16(from_bool(as[15]),
+                       from_bool(as[14]),
+                       from_bool(as[13]),
+                       from_bool(as[12]),
+                       from_bool(as[11]),
+                       from_bool(as[10]),
+                       from_bool(as[ 9]),
+                       from_bool(as[ 8]),
+                       from_bool(as[ 7]),
+                       from_bool(as[ 6]),
+                       from_bool(as[ 5]),
+                       from_bool(as[ 4]),
+                       from_bool(as[ 3]),
+                       from_bool(as[ 2]),
+                       from_bool(as[ 1]),
+                       from_bool(as[ 0]))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return *this != boolvec(true); }
+    
+    boolvec operator&&(boolvec x) const 
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator||(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator==(boolvec x) const { return !(*this==x); }
+    boolvec operator!=(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    bool all() const
+    {
+      bool r = true;
+      for (int n=0; n<size; ++n) r = r && (*this)[n];
+      return r;
+    }
+    bool any() const
+    {
+      bool r = false;
+      for (int n=0; n<size; ++n) r = r || (*this)[n];
+      return r;
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi16(a)) {}
+    intvec(int_t const* as):
+    v(_mm256_set_epi16(as[15],
+                       as[14],
+                       as[13],
+                       as[12],
+                       as[11],
+                       as[10],
+                       as[ 9],
+                       as[ 8],
+                       as[ 7],
+                       as[ 6],
+                       as[ 5],
+                       as[ 4],
+                       as[ 3],
+                       as[ 2],
+                       as[ 1],
+                       as[ 0])) {}
+    static intvec iota()
+    {
+      return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
+                              7, 6, 5, 4, 3, 2, 1, 0);
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return v; }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we check
+      // whether x is positive and x-1 is negative.
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit
+      // return (~(x-1) | x).as_bool();
+      // return x.as_bool() || !(x-1).as_bool();
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_add_epi16(vlo, xvlo);
+      vhi = _mm_add_epi16(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_sub_epi16(vlo, xvlo);
+      vhi = _mm_sub_epi16(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_srli_epi16(vlo, n);
+      vhi = _mm_srli_epi16(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_srai_epi16(vlo, n);
+      vhi = _mm_srai_epi16(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_slli_epi16(vlo, n);
+      vhi = _mm_slli_epi16(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U(r[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, r[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, r[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef real_t scalar_t;
+    typedef __m256i vector_t;
+    
+    static char const* name() { return "<AVX:16*fp16>"; }
+    inline void barrier() { asm("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
+    realvec(real_t const* as):
+    v(_mm256_set_epi16(FP::as_int(as[15]),
+                       FP::as_int(as[14]),
+                       FP::as_int(as[13]),
+                       FP::as_int(as[12]),
+                       FP::as_int(as[11]),
+                       FP::as_int(as[10]),
+                       FP::as_int(as[ 9]),
+                       FP::as_int(as[ 8]),
+                       FP::as_int(as[ 7]),
+                       FP::as_int(as[ 6]),
+                       FP::as_int(as[ 5]),
+                       FP::as_int(as[ 4]),
+                       FP::as_int(as[ 3]),
+                       FP::as_int(as[ 2]),
+                       FP::as_int(as[ 1]),
+                       FP::as_int(as[ 0]))) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm256_load_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(all(m.m), true)) {
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm256_store_si256((__m256i*)p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm256_storeu_si256((__m256i*)p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return v; }
+    intvec_t convert_int() const { __builtin_unreachable(); }
+    
+    
+    
+    realvec operator+() const { __builtin_unreachable(); }
+    realvec operator-() const { __builtin_unreachable(); }
+    
+    realvec operator+(realvec x) const { __builtin_unreachable(); }
+    realvec operator-(realvec x) const { __builtin_unreachable(); }
+    realvec operator*(realvec x) const { __builtin_unreachable(); }
+    realvec operator/(realvec x) const { __builtin_unreachable(); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { __builtin_unreachable(); }
+    real_t sum() const { __builtin_unreachable(); }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+    
+    
+    
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    boolvec_t signbit() const { return v; }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<fp16,16>::as_int() const -> intvec_t
+  {
+    return v;
+  }
+  
+  inline
+  auto boolvec<fp16,16>::convert_int() const -> intvec_t
+  {
+    return lsr(as_int(), bits-1);
+  }
+  
+  inline
+  auto boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  auto boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return (( -convert_int() & x.as_int()) |
+            (~-convert_int() & y.as_int())).as_float();
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<fp16,16>::as_float() const -> realvec_t
+  {
+    return v;
+  }
+  
+  inline auto intvec<fp16,16>::convert_float() const -> realvec_t
+  {
+    __builtin_unreachable();
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_FP16_AVX_H
diff --git a/vec_fp8_avx.h b/vec_fp8_avx.h
new file mode 100644
index 0000000..b612f42
--- /dev/null
+++ b/vec_fp8_avx.h
@@ -0,0 +1,645 @@
+// -*-C++-*-
+
+#ifndef VEC_FP8_AVX_H
+#define VEC_FP8_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FP8_32
+  template<> struct boolvec<fp8,32>;
+  template<> struct intvec<fp8,32>;
+  template<> struct realvec<fp8,32>;
+  
+  
+  
+  template<>
+  struct boolvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef bool scalar_t;
+    typedef __m256i bvector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
+    boolvec(bool const* as):
+    v(_mm256_set_epi8(from_bool(as[31]),
+                      from_bool(as[30]),
+                      from_bool(as[29]),
+                      from_bool(as[28]),
+                      from_bool(as[27]),
+                      from_bool(as[26]),
+                      from_bool(as[25]),
+                      from_bool(as[24]),
+                      from_bool(as[23]),
+                      from_bool(as[22]),
+                      from_bool(as[21]),
+                      from_bool(as[20]),
+                      from_bool(as[19]),
+                      from_bool(as[18]),
+                      from_bool(as[17]),
+                      from_bool(as[16]),
+                      from_bool(as[15]),
+                      from_bool(as[14]),
+                      from_bool(as[13]),
+                      from_bool(as[12]),
+                      from_bool(as[11]),
+                      from_bool(as[10]),
+                      from_bool(as[ 9]),
+                      from_bool(as[ 8]),
+                      from_bool(as[ 7]),
+                      from_bool(as[ 6]),
+                      from_bool(as[ 5]),
+                      from_bool(as[ 4]),
+                      from_bool(as[ 3]),
+                      from_bool(as[ 2]),
+                      from_bool(as[ 1]),
+                      from_bool(as[ 0]))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return *this != boolvec(true); }
+    
+    boolvec operator&&(boolvec x) const 
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator||(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator==(boolvec x) const { return !(*this==x); }
+    boolvec operator!=(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    bool all() const
+    {
+      bool r = true;
+      for (int n=0; n<size; ++n) r = r && (*this)[n];
+      return r;
+    }
+    bool any() const
+    {
+      bool r = false;
+      for (int n=0; n<size; ++n) r = r || (*this)[n];
+      return r;
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi8(a)) {}
+    intvec(int_t const* as):
+    v(_mm256_set_epi8(as[31],
+                      as[30],
+                      as[29],
+                      as[28],
+                      as[27],
+                      as[26],
+                      as[25],
+                      as[24],
+                      as[23],
+                      as[22],
+                      as[21],
+                      as[20],
+                      as[19],
+                      as[18],
+                      as[17],
+                      as[16],
+                      as[15],
+                      as[14],
+                      as[13],
+                      as[12],
+                      as[11],
+                      as[10],
+                      as[ 9],
+                      as[ 8],
+                      as[ 7],
+                      as[ 6],
+                      as[ 5],
+                      as[ 4],
+                      as[ 3],
+                      as[ 2],
+                      as[ 1],
+                      as[ 0])) {}
+    static intvec iota()
+    {
+      return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
+                             23, 22, 21, 20, 19, 18, 17, 16,
+                             15, 14, 13, 12, 11, 10, 9, 8,
+                             7, 6, 5, 4, 3, 2, 1, 0);
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return v; }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we check
+      // whether x is positive and x-1 is negative.
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit
+      // return (~(x-1) | x).as_bool();
+      // return x.as_bool() || !(x-1).as_bool();
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_add_epi8(vlo, xvlo);
+      vhi = _mm_add_epi8(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_sub_epi8(vlo, xvlo);
+      vhi = _mm_sub_epi8(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      uint_t masklo = U(0x00ffU) >> U(n);
+      uint_t maskhi = U(0xff00U);
+      __m128i mask = _mm_set1_epi16(masklo | maskhi);
+      vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+      vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      uint_t masklo = U(0x00ffU);
+      uint_t maskhi = U(0xff00U);
+      __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
+                                    _mm_set1_epi16(masklo));
+      __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
+                                    _mm_set1_epi16(maskhi));
+      vlo = _mm_or_si128(vlolo, vlohi);
+      __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
+                                    _mm_set1_epi16(masklo));
+      __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
+                                    _mm_set1_epi16(maskhi));
+      vhi = _mm_or_si128(vhilo, vhihi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      uint_t masklo = U(0x00ffU);
+      uint_t maskhi = U(0xff00U) << U(n);
+      __m128i mask = _mm_set1_epi16(masklo | maskhi);
+      vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+      vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U(r[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, r[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r = *this;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, r[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef real_t scalar_t;
+    typedef __m256i vector_t;
+    
+    static char const* name() { return "<AVX:32*fp8>"; }
+    inline void barrier() { asm("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
+    realvec(real_t const* as):
+    v(_mm256_set_epi8(FP::as_int(as[31]),
+                      FP::as_int(as[30]),
+                      FP::as_int(as[29]),
+                      FP::as_int(as[28]),
+                      FP::as_int(as[27]),
+                      FP::as_int(as[26]),
+                      FP::as_int(as[25]),
+                      FP::as_int(as[24]),
+                      FP::as_int(as[23]),
+                      FP::as_int(as[22]),
+                      FP::as_int(as[21]),
+                      FP::as_int(as[20]),
+                      FP::as_int(as[19]),
+                      FP::as_int(as[18]),
+                      FP::as_int(as[17]),
+                      FP::as_int(as[16]),
+                      FP::as_int(as[15]),
+                      FP::as_int(as[14]),
+                      FP::as_int(as[13]),
+                      FP::as_int(as[12]),
+                      FP::as_int(as[11]),
+                      FP::as_int(as[10]),
+                      FP::as_int(as[ 9]),
+                      FP::as_int(as[ 8]),
+                      FP::as_int(as[ 7]),
+                      FP::as_int(as[ 6]),
+                      FP::as_int(as[ 5]),
+                      FP::as_int(as[ 4]),
+                      FP::as_int(as[ 3]),
+                      FP::as_int(as[ 2]),
+                      FP::as_int(as[ 1]),
+                      FP::as_int(as[ 0]))) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      return _mm256_load_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p, size_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(all(m.m), true)) {
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return loada(p, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      _mm256_store_si256((__m256i*)p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm256_storeu_si256((__m256i*)p, v);
+    }
+    void storeu(real_t* p, size_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, size_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+      if (ioff==0) return storea(p, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return v; }
+    intvec_t convert_int() const { __builtin_unreachable(); }
+    
+    
+    
+    realvec operator+() const { __builtin_unreachable(); }
+    realvec operator-() const { __builtin_unreachable(); }
+    
+    realvec operator+(realvec x) const { __builtin_unreachable(); }
+    realvec operator-(realvec x) const { __builtin_unreachable(); }
+    realvec operator*(realvec x) const { __builtin_unreachable(); }
+    realvec operator/(realvec x) const { __builtin_unreachable(); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { __builtin_unreachable(); }
+    real_t sum() const { __builtin_unreachable(); }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+    
+    
+    
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    boolvec_t signbit() const { return v; }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<fp8,32>::as_int() const -> intvec_t
+  {
+    return v;
+  }
+  
+  inline
+  auto boolvec<fp8,32>::convert_int() const -> intvec_t
+  {
+    return lsr(as_int(), bits-1);
+  }
+  
+  inline
+  auto boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  auto boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return (( -convert_int() & x.as_int()) |
+            (~-convert_int() & y.as_int())).as_float();
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<fp8,32>::as_float() const -> realvec_t
+  {
+    return v;
+  }
+  
+  inline auto intvec<fp8,32>::convert_float() const -> realvec_t
+  {
+    __builtin_unreachable();
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_FP8_AVX_H
diff --git a/vecmathlib.h b/vecmathlib.h
index 451a967..abfeda3 100644
--- a/vecmathlib.h
+++ b/vecmathlib.h
@@ -37,20 +37,16 @@
 // Intel SSE 2
 #if defined __SSE2__
 #  include "vec_float_sse2_scalar.h"
-#  include "vec_float_sse2.h"
-#endif
-// Intel AVX
-#if defined __AVX__
-#  include "vec_float_avx.h"
-#endif
-
-// Intel SSE 2
-#if defined __SSE2__
 #  include "vec_double_sse2_scalar.h"
+#  include "vec_float_sse2.h"
 #  include "vec_double_sse2.h"
 #endif
+
 // Intel AVX
 #if defined __AVX__
+#  include "vec_fp8_avx.h"
+#  include "vec_fp16_avx.h"
+#  include "vec_float_avx.h"
 #  include "vec_double_avx.h"
 #endif