From ab9a95904b3982859cf3a54e13a9896e7c719f5f Mon Sep 17 00:00:00 2001
From: Erik Schnetter <schnetter@gmail.com>
Date: Mon, 17 Jun 2013 14:37:44 -0400
Subject: Rename implementation files to indicate vector size

---
 vec_altivec_float4.h     | 553 +++++++++++++++++++++++++++++++++++++++
 vec_avx_double4.h        | 643 +++++++++++++++++++++++++++++++++++++++++++++
 vec_avx_float8.h         | 646 +++++++++++++++++++++++++++++++++++++++++++++
 vec_avx_fp16_16.h        | 582 +++++++++++++++++++++++++++++++++++++++++
 vec_avx_fp8_32.h         | 648 +++++++++++++++++++++++++++++++++++++++++++++
 vec_double_avx.h         | 643 ---------------------------------------------
 vec_double_qpx.h         | 667 -----------------------------------------------
 vec_double_sse2.h        | 646 ---------------------------------------------
 vec_double_sse2_scalar.h | 528 -------------------------------------
 vec_double_vsx.h         | 656 ----------------------------------------------
 vec_float_altivec.h      | 553 ---------------------------------------
 vec_float_avx.h          | 646 ---------------------------------------------
 vec_float_neon.h         | 558 ---------------------------------------
 vec_float_sse2.h         | 651 ---------------------------------------------
 vec_float_sse2_scalar.h  | 523 -------------------------------------
 vec_fp16_avx.h           | 582 -----------------------------------------
 vec_fp8_avx.h            | 648 ---------------------------------------------
 vec_neon_float2.h        | 558 +++++++++++++++++++++++++++++++++++++++
 vec_qpx_double4.h        | 667 +++++++++++++++++++++++++++++++++++++++++++++++
 vec_sse_double1.h        | 528 +++++++++++++++++++++++++++++++++++++
 vec_sse_double2.h        | 646 +++++++++++++++++++++++++++++++++++++++++++++
 vec_sse_float1.h         | 523 +++++++++++++++++++++++++++++++++++++
 vec_sse_float4.h         | 651 +++++++++++++++++++++++++++++++++++++++++++++
 vec_vsx_double2.h        | 656 ++++++++++++++++++++++++++++++++++++++++++++++
 vecmathlib.h             |  29 ++-
 25 files changed, 7317 insertions(+), 7314 deletions(-)
 create mode 100644 vec_altivec_float4.h
 create mode 100644 vec_avx_double4.h
 create mode 100644 vec_avx_float8.h
 create mode 100644 vec_avx_fp16_16.h
 create mode 100644 vec_avx_fp8_32.h
 delete mode 100644 vec_double_avx.h
 delete mode 100644 vec_double_qpx.h
 delete mode 100644 vec_double_sse2.h
 delete mode 100644 vec_double_sse2_scalar.h
 delete mode 100644 vec_double_vsx.h
 delete mode 100644 vec_float_altivec.h
 delete mode 100644 vec_float_avx.h
 delete mode 100644 vec_float_neon.h
 delete mode 100644 vec_float_sse2.h
 delete mode 100644 vec_float_sse2_scalar.h
 delete mode 100644 vec_fp16_avx.h
 delete mode 100644 vec_fp8_avx.h
 create mode 100644 vec_neon_float2.h
 create mode 100644 vec_qpx_double4.h
 create mode 100644 vec_sse_double1.h
 create mode 100644 vec_sse_double2.h
 create mode 100644 vec_sse_float1.h
 create mode 100644 vec_sse_float4.h
 create mode 100644 vec_vsx_double2.h
diff --git a/vec_altivec_float4.h b/vec_altivec_float4.h
new file mode 100644
index 0000000..813141e
--- /dev/null
+++ b/vec_altivec_float4.h
@@ -0,0 +1,553 @@
+// -*-C++-*-
+
+#ifndef VEC_ALTIVEC_FLOAT4_H
+#define VEC_ALTIVEC_FLOAT4_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// Altivec intrinsics
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FLOAT_4
+  template<> struct boolvec<float,4>;
+  template<> struct intvec<float,4>;
+  template<> struct realvec<float,4>;
+  
+  
+  
+  template<>
+  struct boolvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef bool scalar_t;
+    typedef __vector __bool int bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values are -1, false values are 0
+    static uint_t from_bool(bool a) { return -int_t(a); }
+    static bool to_bool(uint_t a) { return a; }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(vec_splats(from_bool(a))) {}
+    boolvec(bool const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return vec_nor(v, v); }
+    
+    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+    // boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator==(boolvec x) const; // defined after intvec
+    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+    
+    bool all() const { return vec_all_ne(v, BV(false).v); }
+    bool any() const { return vec_any_ne(v, BV(false).v); }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef int_t scalar_t;
+    typedef __vector int ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(vec_splats(a)) {}
+    intvec(int_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    static intvec iota() { return (__vector int){0, 1, 2, 3}; }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    // Vector casts do not change the bit pattern
+    boolvec_t as_bool() const { return (__vector __bool int)v; }
+    boolvec_t convert_bool() const { return *this != IV(0); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(0) - *this; }
+    
+    intvec operator+(intvec x) const { return vec_add(v, x.v); }
+    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return vec_nor(v, v); }
+    
+    intvec operator&(intvec x) const { return vec_and(v, x.v); }
+    intvec operator|(intvec x) const { return vec_or(v, x.v); }
+    intvec operator^(intvec x) const { return vec_xor(v, x.v); }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return lsr(IV(n)); }
+    intvec operator>>(int_t n) const { return *this >> IV(n); }
+    intvec operator<<(int_t n) const { return *this << IV(n); }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      return vec_sr(v, (__vector unsigned int)n.v);
+    }
+    intvec operator>>(intvec n) const
+    {
+      return vec_sra(v, (__vector unsigned int)n.v);
+    }
+    intvec operator<<(intvec n) const
+    {
+      return vec_sl(v, (__vector unsigned int)n.v);
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return *this < IV(I(0));
+    }
+    
+    boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); }
+    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
+    boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); }
+    boolvec_t operator<=(intvec const& x) const { return !(*this > x); }
+    boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); }
+    boolvec_t operator>=(intvec const& x) const { return !(*this < x); }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef real_t scalar_t;
+    typedef __vector float vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<Altivec:4*float>"; }
+    void barrier() { __asm__("": "+v" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(vec_splats(a)) {}
+    realvec(real_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return vec_ld(0, p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      realvec_t v0 = vec_ld(0, p);
+      realvec_t v1 = vec_ld(15, p);
+      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    // Masked aligned load: lanes selected by m.m are read from p, the
+    // remaining lanes keep the values of *this.
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Fast path on the cached flag m.all_m, like the other masked accessors
+      if (__builtin_expect(m.all_m, true)) return loada(p);
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      vec_st(v, 0, p);
+    }
+    void storeu(real_t* p) const
+    {
+      // Vector stores would require vector loads, which would need to
+      // be atomic
+      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
+      p[0] = (*this)[0];
+      p[1] = (*this)[1];
+      p[2] = (*this)[2];
+      p[3] = (*this)[3];
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+	// Use vec_ste?
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+	// Use vec_ste?
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return (__vector int) v; }
+    intvec_t convert_int() const { return vec_cts(v, 0); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return RV(-0.0) - *this; } // -0.0-x keeps the sign of zero: -(+0) == -0
+    
+    realvec operator+(realvec x) const { return vec_add(v, x.v); }
+    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+    realvec operator*(realvec x) const {
+#if defined __VSX__
+      return vec_mul(v, x.v);
+#else
+      return vec_madd(v, x.v, RV(0.0).v);
+#endif
+    }
+    realvec operator/(realvec x) const {
+#if defined __VSX__
+      return vec_div(v, x.v);
+#else
+      return *this * x.rcp();
+#endif
+    }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    }
+    real_t sum() const
+    {
+      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
+    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
+    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
+    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
+    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
+    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const { return vec_ceil(v); }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return vec_abs(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const { return vec_floor(v); }
+    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+    realvec fmax(realvec y) const { return vec_max(v, y.v); }
+    realvec fmin(realvec y) const { return vec_min(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const
+    {
+      realvec x = *this;
+      realvec r = vec_re(v);    // this is only an approximation
+      // TODO: use fma
+      // Note: don't rewrite this expression, this may introduce
+      // cancellation errors
+      r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp)
+      return r;
+    }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const { return vec_round(v); }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const
+    {
+#if defined __VSX__
+      return vec_rsqrt(v);
+#else
+      realvec x = *this;
+      realvec r = vec_rsqrte(x.v); // this is only an approximation
+      // TODO: use fma
+      // one Newton iteration (see vml_rsqrt)
+      r += RV(0.5)*r * (RV(1.0) - x * r*r);
+      return r;
+#endif
+    }
+    boolvec_t signbit() const { return MF::vml_signbit(*this); }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const {
+#if defined __VSX__
+      return vec_sqrt(v);
+#else
+      return *this * rsqrt();
+#endif
+    }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const { return vec_trunc(v); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<float,4>::as_int() const -> intvec_t
+  {
+    return (__vector int) v;
+  }
+  
+  inline
+  auto boolvec<float,4>::convert_int() const -> intvec_t
+  {
+    return -(__vector int)v;
+  }
+  
+  inline
+  auto boolvec<float,4>::operator==(boolvec x) const -> boolvec_t
+  {
+    return as_int() == x.as_int();
+  }
+  
+  inline
+  auto boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return vec_sel(y.v, x.v, v);
+  }
+  
+  inline
+  auto boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return vec_sel(y.v, x.v, v);
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,4>::as_float() const -> realvec_t
+  {
+    return (__vector float)v;
+  }
+  
+  inline auto intvec<float,4>::convert_float() const -> realvec_t
+  {
+    return vec_ctf(v, 0);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_ALTIVEC_FLOAT4_H
diff --git a/vec_avx_double4.h b/vec_avx_double4.h
new file mode 100644
index 0000000..15f7edf
--- /dev/null
+++ b/vec_avx_double4.h
@@ -0,0 +1,643 @@
+// -*-C++-*-
+
+#ifndef VEC_AVX_DOUBLE4_H
+#define VEC_AVX_DOUBLE4_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_DOUBLE_4
+  template<> struct boolvec<double,4>;
+  template<> struct intvec<double,4>;
+  template<> struct realvec<double,4>;
+  
+  
+  
+  template<>
+  struct boolvec<double,4>: floatprops<double>
+  {
+    static int const size = 4;
+    typedef bool scalar_t;
+    typedef __m256d bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a):
+    v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+    boolvec(bool const* as):
+    v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
+                                            from_bool(as[2]),
+                                            from_bool(as[1]),
+                                            from_bool(as[0])))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+    
+    boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); }
+    boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); }
+    
+    bool all() const
+    {
+      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+      return ! (! *this).any();
+    }
+    bool any() const
+    {
+      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+      return ! _mm256_testz_pd(v, v);
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,4>: floatprops<double>
+  {
+    static int const size = 4;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
+    intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+    static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we check
+      // whether x is positive and x-1 is negative.
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit
+      // return (~(x-1) | x).as_bool();
+      // return x.as_bool() || !(x-1).as_bool();
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_add_epi64(vlo, xvlo);
+      vhi = _mm_add_epi64(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_sub_epi64(vlo, xvlo);
+      vhi = _mm_sub_epi64(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
+                                               _mm256_castsi256_pd(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
+                                              _mm256_castsi256_pd(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
+                                               _mm256_castsi256_pd(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_srli_epi64(vlo, n);
+      vhi = _mm_srli_epi64(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      // There is no _mm_srai_epi64. To emulate an arithmetic shift, add
+      // the bias 2^(bits-1) before a logical shift, and subtract the
+      // shifted bias 2^(bits-1-n) afterwards
+#if 0
+      __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(vlo, 63));
+      __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(vhi, 63));
+      vlo = _mm_xor_si128(signmask01, vlo);
+      vhi = _mm_xor_si128(signmask23, vhi);
+      vlo = _mm_srli_epi64(vlo, n);
+      vhi = _mm_srli_epi64(vhi, n);
+      vlo = _mm_xor_si128(signmask01, vlo);
+      vhi = _mm_xor_si128(signmask23, vhi);
+#else
+      // Convert signed to unsigned (add the bias)
+      vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1)));
+      vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1)));
+      // Logical shift of both 128-bit halves
+      vlo = _mm_srli_epi64(vlo, n);
+      vhi = _mm_srli_epi64(vhi, n);
+      // Undo conversion (subtract the shifted bias)
+      vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n)));
+      vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n)));
+#endif
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_slli_epi64(vlo, n);
+      vhi = _mm_slli_epi64(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+    boolvec_t operator<(intvec const& x) const
+    {
+      // return (*this - x).as_bool();
+      boolvec_t r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] < x[i]);
+      }
+      return r;
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x.operator<(*this); // a > b is b < a with the operands swapped
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! this->operator<(x); // a >= b is the negation of a < b
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,4>: floatprops<double> // 4 doubles per AVX register
+  {
+    static int const size = 4;
+    typedef real_t scalar_t;
+    typedef __m256d vector_t;
+    static int const alignment = sizeof(vector_t);
+    // name() labels this backend; barrier() is an empty asm with v as "+x"
+    static char const* name() { return "<AVX:4*double>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    // The wrapped AVX register
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_pd(a)) {}
+    realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    // Loads/stores: "a" forms assert that p is aligned to "alignment".
+    // Masked loads keep *this in unselected lanes; masked stores skip them.
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return _mm256_load_pd(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_pd(p);
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      _mm256_store_pd(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      _mm256_storeu_pd(p, v);
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        _mm256_maskstore_pd(p, m.m.as_int(), v);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    // as_int() is a pure bit cast; convert_int() is delegated to mathfuncs
+    intvec_t as_int() const { return _mm256_castpd_si256(v); }
+    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+    
+    
+    // Unary minus flips the sign bit so that -(+0.0) is -0.0 (0.0-x gives +0.0)
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return _mm256_xor_pd(v, RV(-0.0)); }
+    
+    realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); }
+    realvec operator-(realvec x) const { return _mm256_sub_pd(v, x.v); }
+    realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); }
+    realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    }
+    real_t sum() const
+    {
+      // _mm256_hadd_pd adds adjacent pairs within each 128-bit half, so
+      // after one step lane 0 holds v[0]+v[1] and lane 2 holds v[2]+v[3].
+      // Adding those two lanes gives the sum of all four elements.
+      // (Scalar equivalent: (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3])
+      realvec x = *this;
+      x = _mm256_hadd_pd(x.v, x.v);
+      return x[0] + x[2];
+    }
+    
+    // Comparisons follow scalar IEEE rules: with a NaN operand every test is
+    // false except !=, which is true (hence the unordered predicate there).
+    boolvec_t operator==(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
+    }
+    boolvec_t operator!=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ);
+    }
+    boolvec_t operator<(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
+    }
+    boolvec_t operator<=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
+    }
+    boolvec_t operator>(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
+    }
+    boolvec_t operator>=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
+    }
+    
+    
+    // Math functions: AVX intrinsics where used below, otherwise mathfuncs
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const { return _mm256_ceil_pd(v); }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const { return _mm256_floor_pd(v); }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const { return _mm256_max_pd(v, y.v); }
+    realvec fmin(realvec y) const { return _mm256_min_pd(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const
+    {
+      return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    boolvec_t signbit() const { return v; }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const { return _mm256_sqrt_pd(v); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  // Bit-for-bit view of the mask as an integer vector (a cast, no conversion)
+  inline auto boolvec<double,4>::as_int() const -> intvec_t
+  {
+    return intvec_t(_mm256_castpd_si256(v));
+  }
+  
+  // Logical right shift of the mask bits by bits-1; the author's note gives
+  // the intended result as ifthen(v, U(1), U(0)), i.e. 1 for true, 0 for false
+  inline auto boolvec<double,4>::convert_int() const -> intvec_t
+  {
+    return lsr(this->as_int(), bits-1);
+  }
+  
+  inline auto
+  boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  { // reuse the floating-point select on the reinterpreted bit patterns
+    return this->ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline auto
+  boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  { // blendv takes the "else" value first; the mask v selects x over y
+    return realvec_t(_mm256_blendv_pd(y.v, x.v, v));
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<double,4>::as_float() const -> realvec_t
+  { // reinterpret the 256 bits as doubles; no numeric conversion happens
+    return realvec_t(_mm256_castsi256_pd(v));
+  }
+  
+  inline auto intvec<double,4>::convert_float() const -> realvec_t
+  { // delegates to mathfuncs; presumably a value conversion, unlike as_float()
+    return MF::vml_convert_float(*this);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_AVX_DOUBLE4_H
diff --git a/vec_avx_float8.h b/vec_avx_float8.h
new file mode 100644
index 0000000..e7e1187
--- /dev/null
+++ b/vec_avx_float8.h
@@ -0,0 +1,646 @@
+// -*-C++-*-
+
+#ifndef VEC_AVX_FLOAT8_H
+#define VEC_AVX_FLOAT8_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FLOAT_8
+  template<> struct boolvec<float,8>;
+  template<> struct intvec<float,8>;
+  template<> struct realvec<float,8>;
+  
+  
+  
+  template<>
+  struct boolvec<float,8>: floatprops<float> // 8 truth values, one per lane
+  {
+    static int const size = 8;
+    typedef bool scalar_t;
+    typedef __m256 bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); } // 0 or all ones
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    // The wrapped AVX register; elements are accessed as uint_t lanes
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a):
+    v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
+    boolvec(bool const* as):
+    v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]),
+                                           from_bool(as[6]),
+                                           from_bool(as[5]),
+                                           from_bool(as[4]),
+                                           from_bool(as[3]),
+                                           from_bool(as[2]),
+                                           from_bool(as[1]),
+                                           from_bool(as[0])))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    // as_int(): bit cast; convert_int(): logical shift of as_int() by bits-1
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    // Logical operators act bitwise on whole lanes; ! XORs with all-true
+    boolvec operator!() const { return _mm256_xor_ps(boolvec(true), v); }
+    
+    boolvec operator&&(boolvec x) const { return _mm256_and_ps(v, x.v); }
+    boolvec operator||(boolvec x) const { return _mm256_or_ps(v, x.v); }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const { return _mm256_xor_ps(v, x.v); }
+    
+    bool all() const
+    {
+      // return
+      //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+      //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
+      return ! (! *this).any(); // all true <=> no lane of the negation is true
+    }
+    bool any() const
+    {
+      // return
+      //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+      //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
+      return ! _mm256_testz_ps(v, v); // testz_ps inspects only the sign bits
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,8>: floatprops<float> // 8 integers, one per 32-bit lane
+  {
+    static int const size = 8;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    // The wrapped AVX register; elements are accessed as int_t lanes
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi32(a)) {}
+    intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4],
+                                                as[3], as[2], as[1], as[0])) {}
+    static intvec iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    // as_bool() is a pure bit cast; convert_bool() tests lanes against zero
+    boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we combine
+      // two sign bits: that of x, and that of x + (signbit_mask - 1).
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit.
+      // With signbit_mask being just the top bit: x<0 sets the first, x>0
+      // carries into the sign bit of the sum, and x==0 leaves both clear.
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    // Integer operations are applied to each 128-bit half with SSE intrinsics
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(0) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_add_epi32(vlo, xvlo);
+      vhi = _mm_add_epi32(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      __m128i xvlo = _mm256_castsi256_si128(x.v);
+      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+      vlo = _mm_sub_epi32(vlo, xvlo);
+      vhi = _mm_sub_epi32(vhi, xvhi);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    // Bitwise operations go through the 256-bit float-domain instructions
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    // Shifts: lsr is logical (srli), >> is arithmetic (srai), << is slli
+    intvec lsr(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_srli_epi32(vlo, n);
+      vhi = _mm_srli_epi32(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_srai_epi32(vlo, n);
+      vhi = _mm_srai_epi32(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_slli_epi32(vlo, n);
+      vhi = _mm_slli_epi32(vhi, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) << U(n[i])); // unsigned: no UB if negative
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    // Comparisons return a boolvec; all are derived from != and <
+    boolvec_t signbit() const
+    {
+      return as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+    boolvec_t operator<(intvec const& x) const
+    { // signed SSE2 compare per 128-bit half; true lanes come out as all ones
+      __m128i xlo = _mm256_castsi256_si128(x.v);
+      __m128i xhi = _mm256_extractf128_si256(x.v, 1);
+      __m128i lo = _mm_cmplt_epi32(_mm256_castsi256_si128(v), xlo);
+      __m128i hi = _mm_cmplt_epi32(_mm256_extractf128_si256(v, 1), xhi);
+      IV r = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+      return r.as_bool();
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,8>: floatprops<float> // 8 floats per AVX register
+  {
+    static int const size = 8;
+    typedef real_t scalar_t;
+    typedef __m256 vector_t;
+    static int const alignment = sizeof(vector_t);
+    // name() labels this backend; barrier() is an empty asm with v as "+x"
+    static char const* name() { return "<AVX:8*float>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    // The wrapped AVX register
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_ps(a)) {}
+    realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4],
+                                               as[3], as[2], as[1], as[0])) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    // Loads/stores: "a" forms assert that p is aligned to "alignment".
+    // Masked loads keep *this in unselected lanes; masked stores skip them.
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return _mm256_load_ps(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_ps(p);
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      _mm256_store_ps(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      _mm256_storeu_ps(p, v);
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        _mm256_maskstore_ps(p, m.m.as_int(), v);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    // as_int() is a pure bit cast; convert_int() truncates (cvttps)
+    intvec_t as_int() const { return _mm256_castps_si256(v); }
+    intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
+    
+    
+    // Unary minus flips the sign bit so that -(+0.0) is -0.0 (0.0-x gives +0.0)
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return _mm256_xor_ps(v, RV(-0.0)); }
+    
+    realvec operator+(realvec x) const { return _mm256_add_ps(v, x.v); }
+    realvec operator-(realvec x) const { return _mm256_sub_ps(v, x.v); }
+    realvec operator*(realvec x) const { return _mm256_mul_ps(v, x.v); }
+    realvec operator/(realvec x) const { return _mm256_div_ps(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return
+        (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
+        (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
+    }
+    real_t sum() const
+    {
+      // _mm256_hadd_ps adds adjacent pairs within each 128-bit half.
+      // After the first step, lanes 0 and 1 of a half hold the two pair
+      // sums of that half; after the second step lane 0 of each half
+      // holds the sum of that half's four elements.
+      // Lane 0 and lane 4 are therefore the two half sums, and their
+      // total is the sum of all eight elements. Scalar equivalent:
+      //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
+      //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7]
+      realvec x = *this;
+      x = _mm256_hadd_ps(x.v, x.v);
+      x = _mm256_hadd_ps(x.v, x.v);
+      return x[0] + x[4];
+    }
+    
+    // Comparisons follow scalar IEEE rules: with a NaN operand every test is
+    // false except !=, which is true (hence the unordered predicate there).
+    boolvec_t operator==(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
+    }
+    boolvec_t operator!=(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ);
+    }
+    boolvec_t operator<(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
+    }
+    boolvec_t operator<=(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
+    }
+    boolvec_t operator>(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+    }
+    boolvec_t operator>=(realvec const& x) const
+    {
+      return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+    }
+    
+    
+    // Math functions: AVX intrinsics where used below, otherwise mathfuncs
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const { return _mm256_ceil_ps(v); }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const { return _mm256_floor_ps(v); }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const { return _mm256_max_ps(v, y.v); }
+    realvec fmin(realvec y) const { return _mm256_min_ps(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const
+    {
+      realvec x = *this;
+      realvec r = _mm256_rcp_ps(x); // this is only an approximation
+      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
+      return r;
+    }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const
+    {
+      return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const
+    {
+      realvec x = *this;
+      realvec r = _mm256_rsqrt_ps(x);    // this is only an approximation
+      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
+      return r;
+    }
+    boolvec_t signbit() const { return v; }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const { return _mm256_sqrt_ps(v); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  // Bit-for-bit view of the mask as an integer vector (a cast, no conversion)
+  inline auto boolvec<float,8>::as_int() const -> intvec_t
+  {
+    return intvec_t(_mm256_castps_si256(v));
+  }
+  
+  inline auto boolvec<float,8>::convert_int() const -> intvec_t
+  {
+    // Logical shift by bits-1 (assuming bits is the lane width): 1 or 0 per lane
+    return lsr(this->as_int(), bits-1);
+  }
+  
+  inline auto
+  boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  { // reuse the floating-point select on the reinterpreted bit patterns
+    return this->ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline auto
+  boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  { // blendv takes the "else" value first; the mask v selects x over y
+    return realvec_t(_mm256_blendv_ps(y.v, x.v, v));
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,8>::as_float() const -> realvec_t
+  { // reinterpret the 256 bits as floats; no numeric conversion happens
+    return realvec_t(_mm256_castsi256_ps(v));
+  }
+  
+  inline auto intvec<float,8>::convert_float() const -> realvec_t
+  { // numeric int32 -> float conversion done by the cvtepi32_ps intrinsic
+    return realvec_t(_mm256_cvtepi32_ps(v));
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_AVX_FLOAT8_H
diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h
new file mode 100644
index 0000000..7cb98b2
--- /dev/null
+++ b/vec_avx_fp16_16.h
@@ -0,0 +1,582 @@
+// -*-C++-*-
+
+#ifndef VEC_AVX_FP16_16_H
+#define VEC_AVX_FP16_16_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FP16_16
+  template<> struct boolvec<fp16,16>;
+  template<> struct intvec<fp16,16>;
+  template<> struct realvec<fp16,16>;
+  
+  
+  
+  template<>
+  struct boolvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef bool scalar_t;
+    typedef __m256i bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
+    boolvec(bool const* as):
+    v(_mm256_set_epi16(from_bool(as[15]),
+                       from_bool(as[14]),
+                       from_bool(as[13]),
+                       from_bool(as[12]),
+                       from_bool(as[11]),
+                       from_bool(as[10]),
+                       from_bool(as[ 9]),
+                       from_bool(as[ 8]),
+                       from_bool(as[ 7]),
+                       from_bool(as[ 6]),
+                       from_bool(as[ 5]),
+                       from_bool(as[ 4]),
+                       from_bool(as[ 3]),
+                       from_bool(as[ 2]),
+                       from_bool(as[ 1]),
+                       from_bool(as[ 0]))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return *this != boolvec(true); }
+    
+    boolvec operator&&(boolvec x) const 
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator||(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    bool all() const
+    {
+      // true only if every one of the size lanes converts to true
+      for (int n=0; n<size; ++n) if (!(*this)[n]) return false;
+      return true;
+    }
+    bool any() const
+    {
+      // true as soon as one of the size lanes converts to true
+      for (int n=0; n<size; ++n) if ((*this)[n]) return true;
+      return false;
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi16(a)) {}
+    intvec(int_t const* as):
+    v(_mm256_set_epi16(as[15],
+                       as[14],
+                       as[13],
+                       as[12],
+                       as[11],
+                       as[10],
+                       as[ 9],
+                       as[ 8],
+                       as[ 7],
+                       as[ 6],
+                       as[ 5],
+                       as[ 4],
+                       as[ 3],
+                       as[ 2],
+                       as[ 1],
+                       as[ 0])) {}
+    static intvec iota()
+    {
+      return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
+                              7, 6, 5, 4, 3, 2, 1, 0);
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return v; }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // No compare-with-zero intrinsic is used here. Instead, a lane is true
+      // if x is negative, or if adding (signbit_mask-1) sets its sign bit.
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit
+      // Earlier variants: (~(x-1) | x).as_bool()
+      //                   x.as_bool() || !(x-1).as_bool()
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      // Lane-wise 16-bit addition, carried out separately on the lower and
+      // upper 128-bit halves and then recombined into one 256-bit vector.
+      __m128i const sumlo = _mm_add_epi16(_mm256_castsi256_si128(v),
+                                          _mm256_castsi256_si128(x.v));
+      __m128i const sumhi = _mm_add_epi16(_mm256_extractf128_si256(v, 1),
+                                          _mm256_extractf128_si256(x.v, 1));
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(sumlo), sumhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      // Lane-wise 16-bit subtraction, carried out separately on the lower
+      // and upper 128-bit halves and then recombined into one 256-bit vector.
+      __m128i const diflo = _mm_sub_epi16(_mm256_castsi256_si128(v),
+                                          _mm256_castsi256_si128(x.v));
+      __m128i const difhi = _mm_sub_epi16(_mm256_extractf128_si256(v, 1),
+                                          _mm256_extractf128_si256(x.v, 1));
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(diflo), difhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      // Logical right shift of every 16-bit lane by n bits; the two 128-bit
+      // halves are shifted separately and then joined again.
+      __m128i const lo = _mm_srli_epi16(_mm256_castsi256_si128(v), n);
+      __m128i const hi = _mm_srli_epi16(_mm256_extractf128_si256(v, 1), n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      // Arithmetic right shift of every 16-bit lane by n bits; the two
+      // 128-bit halves are shifted separately and then joined again.
+      __m128i const lo = _mm_srai_epi16(_mm256_castsi256_si128(v), n);
+      __m128i const hi = _mm_srai_epi16(_mm256_extractf128_si256(v, 1), n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      // Left shift of every 16-bit lane by n bits; the two 128-bit halves
+      // are shifted separately and then joined again.
+      __m128i const lo = _mm_slli_epi16(_mm256_castsi256_si128(v), n);
+      __m128i const hi = _mm_slli_epi16(_mm256_extractf128_si256(v, 1), n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<fp16,16>: floatprops<fp16>
+  {
+    static int const size = 16;
+    typedef real_t scalar_t;
+    typedef __m256i vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<AVX:16*fp16>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
+    realvec(real_t const* as):
+    v(_mm256_set_epi16(FP::as_int(as[15]),
+                       FP::as_int(as[14]),
+                       FP::as_int(as[13]),
+                       FP::as_int(as[12]),
+                       FP::as_int(as[11]),
+                       FP::as_int(as[10]),
+                       FP::as_int(as[ 9]),
+                       FP::as_int(as[ 8]),
+                       FP::as_int(as[ 7]),
+                       FP::as_int(as[ 6]),
+                       FP::as_int(as[ 5]),
+                       FP::as_int(as[ 4]),
+                       FP::as_int(as[ 3]),
+                       FP::as_int(as[ 2]),
+                       FP::as_int(as[ 1]),
+                       FP::as_int(as[ 0]))) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return _mm256_load_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_si256((__m256i const*)p);
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      // Masked aligned load: lanes selected by m.m are read from p, the
+      // remaining lanes keep the value of *this. The fast path tests
+      // m.all_m, exactly as the other masked loads and stores do.
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) return loada(p);
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      _mm256_store_si256((__m256i*)p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      _mm256_storeu_si256((__m256i*)p, v); // store without alignment demand
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        // TODO: this is expensive
+        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return v; }
+    intvec_t convert_int() const { __builtin_unreachable(); }
+    
+    
+    
+    realvec operator+() const { __builtin_unreachable(); }
+    realvec operator-() const { __builtin_unreachable(); }
+    
+    realvec operator+(realvec x) const { __builtin_unreachable(); }
+    realvec operator-(realvec x) const { __builtin_unreachable(); }
+    realvec operator*(realvec x) const { __builtin_unreachable(); }
+    realvec operator/(realvec x) const { __builtin_unreachable(); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { __builtin_unreachable(); }
+    real_t sum() const { __builtin_unreachable(); }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+    
+    
+    
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    boolvec_t signbit() const { return v; }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<fp16,16>::as_int() const -> intvec_t
+  {
+    return v;
+  }
+  
+  inline
+  auto boolvec<fp16,16>::convert_int() const -> intvec_t
+  {
+    return lsr(as_int(), bits-1);
+  }
+  
+  inline
+  auto boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  auto boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    intvec_t const mask = -convert_int(); // negated: acts as per-lane bit mask
+    return ((mask & x.as_int()) | (~mask & y.as_int())).as_float();
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<fp16,16>::as_float() const -> realvec_t
+  {
+    return v;
+  }
+  
+  inline auto intvec<fp16,16>::convert_float() const -> realvec_t
+  {
+    __builtin_unreachable();
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_AVX_FP16_16_H
diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h
new file mode 100644
index 0000000..a6e33a1
--- /dev/null
+++ b/vec_avx_fp8_32.h
@@ -0,0 +1,648 @@
+// -*-C++-*-
+
+#ifndef VEC_AVX_FP8_32_H
+#define VEC_AVX_FP8_32_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FP8_32
+  template<> struct boolvec<fp8,32>;
+  template<> struct intvec<fp8,32>;
+  template<> struct realvec<fp8,32>;
+  
+  
+  
+  template<>
+  struct boolvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef bool scalar_t;
+    typedef __m256i bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
+    boolvec(bool const* as):
+    v(_mm256_set_epi8(from_bool(as[31]),
+                      from_bool(as[30]),
+                      from_bool(as[29]),
+                      from_bool(as[28]),
+                      from_bool(as[27]),
+                      from_bool(as[26]),
+                      from_bool(as[25]),
+                      from_bool(as[24]),
+                      from_bool(as[23]),
+                      from_bool(as[22]),
+                      from_bool(as[21]),
+                      from_bool(as[20]),
+                      from_bool(as[19]),
+                      from_bool(as[18]),
+                      from_bool(as[17]),
+                      from_bool(as[16]),
+                      from_bool(as[15]),
+                      from_bool(as[14]),
+                      from_bool(as[13]),
+                      from_bool(as[12]),
+                      from_bool(as[11]),
+                      from_bool(as[10]),
+                      from_bool(as[ 9]),
+                      from_bool(as[ 8]),
+                      from_bool(as[ 7]),
+                      from_bool(as[ 6]),
+                      from_bool(as[ 5]),
+                      from_bool(as[ 4]),
+                      from_bool(as[ 3]),
+                      from_bool(as[ 2]),
+                      from_bool(as[ 1]),
+                      from_bool(as[ 0]))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return *this != boolvec(true); }
+    
+    boolvec operator&&(boolvec x) const 
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator||(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    bool all() const
+    {
+      // true only if every one of the size lanes converts to true
+      for (int n=0; n<size; ++n) if (!(*this)[n]) return false;
+      return true;
+    }
+    bool any() const
+    {
+      // true as soon as one of the size lanes converts to true
+      for (int n=0; n<size; ++n) if ((*this)[n]) return true;
+      return false;
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi8(a)) {}
+    intvec(int_t const* as):
+    v(_mm256_set_epi8(as[31],
+                      as[30],
+                      as[29],
+                      as[28],
+                      as[27],
+                      as[26],
+                      as[25],
+                      as[24],
+                      as[23],
+                      as[22],
+                      as[21],
+                      as[20],
+                      as[19],
+                      as[18],
+                      as[17],
+                      as[16],
+                      as[15],
+                      as[14],
+                      as[13],
+                      as[12],
+                      as[11],
+                      as[10],
+                      as[ 9],
+                      as[ 8],
+                      as[ 7],
+                      as[ 6],
+                      as[ 5],
+                      as[ 4],
+                      as[ 3],
+                      as[ 2],
+                      as[ 1],
+                      as[ 0])) {}
+    static intvec iota()
+    {
+      return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
+                             23, 22, 21, 20, 19, 18, 17, 16,
+                             15, 14, 13, 12, 11, 10, 9, 8,
+                             7, 6, 5, 4, 3, 2, 1, 0);
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return v; }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // No compare-with-zero intrinsic is used here. Instead, a lane is true
+      // if x is negative, or if adding (signbit_mask-1) sets its sign bit.
+      intvec x = *this;
+      // We know that boolvec values depend only on the sign bit
+      // Earlier variants: (~(x-1) | x).as_bool()
+      //                   x.as_bool() || !(x-1).as_bool()
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      // Lane-wise 8-bit addition, carried out separately on the lower and
+      // upper 128-bit halves and then recombined into one 256-bit vector.
+      __m128i const sumlo = _mm_add_epi8(_mm256_castsi256_si128(v),
+                                         _mm256_castsi256_si128(x.v));
+      __m128i const sumhi = _mm_add_epi8(_mm256_extractf128_si256(v, 1),
+                                         _mm256_extractf128_si256(x.v, 1));
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(sumlo), sumhi, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      // Lane-wise 8-bit subtraction, carried out separately on the lower
+      // and upper 128-bit halves and then recombined into one 256-bit vector.
+      __m128i const diflo = _mm_sub_epi8(_mm256_castsi256_si128(v),
+                                         _mm256_castsi256_si128(x.v));
+      __m128i const difhi = _mm_sub_epi8(_mm256_extractf128_si256(v, 1),
+                                         _mm256_extractf128_si256(x.v, 1));
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(diflo), difhi, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+                                              _mm256_castsi256_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+                                               _mm256_castsi256_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      // Logical right shift of 8-bit lanes via 16-bit shifts. The mask clears
+      // bits leaked from the odd byte; built as int so 0xff00 isn't truncated.
+      __m128i mask = _mm_set1_epi16(short(0xff00 | (0x00ff >> U(n))));
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+      vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      // Arithmetic right shift of 8-bit lanes, emulated with 16-bit shifts.
+      // Even bytes are first moved up by 8 bits so that their own sign bit
+      // is replicated; odd bytes are shifted in place. The byte masks are
+      // built as 16-bit constants: an 8-bit uint_t would truncate 0xff00.
+      __m128i masklo = _mm_set1_epi16(short(0x00ff));
+      __m128i maskhi = _mm_set1_epi16(short(0xff00));
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_or_si128
+        (_mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8), masklo),
+         _mm_and_si128(_mm_srai_epi16(vlo, n), maskhi));
+      vhi = _mm_or_si128
+        (_mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8), masklo),
+         _mm_and_si128(_mm_srai_epi16(vhi, n), maskhi));
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      // Left shift of 8-bit lanes via 16-bit shifts. The mask clears bits
+      // leaked into the odd byte; built as int so 0xff00 isn't truncated.
+      __m128i mask = _mm_set1_epi16(short(0x00ff | (0xff00 << U(n))));
+      __m128i vlo = _mm256_castsi256_si128(v);
+      __m128i vhi = _mm256_extractf128_si256(v, 1);
+      vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+      vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<fp8,32>: floatprops<fp8>
+  {
+    static int const size = 32;
+    typedef real_t scalar_t;
+    typedef __m256i vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<AVX:32*fp8>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}  // adopts an existing raw vector
+    realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}  // replicates FP::as_int(a) into all 32 bytes
+    realvec(real_t const* as):  // reads as[0..31]; _mm256_set_epi8 lists the highest element first
+    v(_mm256_set_epi8(FP::as_int(as[31]),
+                      FP::as_int(as[30]),
+                      FP::as_int(as[29]),
+                      FP::as_int(as[28]),
+                      FP::as_int(as[27]),
+                      FP::as_int(as[26]),
+                      FP::as_int(as[25]),
+                      FP::as_int(as[24]),
+                      FP::as_int(as[23]),
+                      FP::as_int(as[22]),
+                      FP::as_int(as[21]),
+                      FP::as_int(as[20]),
+                      FP::as_int(as[19]),
+                      FP::as_int(as[18]),
+                      FP::as_int(as[17]),
+                      FP::as_int(as[16]),
+                      FP::as_int(as[15]),
+                      FP::as_int(as[14]),
+                      FP::as_int(as[13]),
+                      FP::as_int(as[12]),
+                      FP::as_int(as[11]),
+                      FP::as_int(as[10]),
+                      FP::as_int(as[ 9]),
+                      FP::as_int(as[ 8]),
+                      FP::as_int(as[ 7]),
+                      FP::as_int(as[ 6]),
+                      FP::as_int(as[ 5]),
+                      FP::as_int(as[ 4]),
+                      FP::as_int(as[ 3]),
+                      FP::as_int(as[ 2]),
+                      FP::as_int(as[ 1]),
+                      FP::as_int(as[ 0]))) {}
+    
+    operator vector_t() const { return v; }  // implicit access to the raw vector
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }  // reads element n through a real_t view of v
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }  // writes element n in place, returns *this
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);  // p must be vector-aligned
+      return _mm256_load_si256(reinterpret_cast<__m256i const*>(p));  // aligned load
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm256_loadu_si256(reinterpret_cast<__m256i const*>(p));  // unaligned load
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // A whole-vector offset from an aligned base is itself aligned
+      return ioff % realvec::size == 0 ? loada(p+ioff) : loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {  // same flag the other masked loads/stores test
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);  // loaded value where the mask is set, *this elsewhere
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      // Expected case: m.all_m is set, so a plain unaligned load suffices
+      if (__builtin_expect(m.all_m, true)) return loadu(p);
+      // Otherwise take the loaded value where the mask is set and *this elsewhere
+      realvec_t const loaded = loadu(p);
+      return m.m.ifthen(loaded, *this);
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Aligned masked load when the offset is a whole number of vectors
+      return ioff % realvec::size == 0 ? loada(p+ioff, m) : loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);  // p must be vector-aligned
+      _mm256_store_si256(reinterpret_cast<__m256i*>(p), v);  // aligned store
+    }
+    void storeu(real_t* p) const
+    {
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v);  // unaligned store
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Aligned store when the offset is a whole number of vectors
+      if (ioff % realvec::size != 0) storeu(p+ioff); else storea(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) { storea(p); return; }
+      // TODO: this is expensive
+      // Scalar fallback: write only the elements whose mask entry is true
+      for (int k=0; k!=size; ++k) {
+        if (m.m[k]) p[k] = (*this)[k];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) { storeu(p); return; }
+      // TODO: this is expensive
+      // Scalar fallback: write only the elements whose mask entry is true
+      for (int k=0; k!=size; ++k) {
+        if (m.m[k]) p[k] = (*this)[k];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Aligned masked store when the offset is a whole number of vectors
+      if (ioff % realvec::size != 0) storeu(p+ioff, m); else storea(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return v; }  // hands the raw vector to intvec_t; no value conversion is applied here
+    intvec_t convert_int() const { __builtin_unreachable(); }  // stub: must never be called
+    
+    
+    
+    realvec operator+() const { __builtin_unreachable(); }  // arithmetic stubs: bodies are __builtin_unreachable()
+    realvec operator-() const { __builtin_unreachable(); }  // so control must never reach any of them
+    
+    realvec operator+(realvec x) const { __builtin_unreachable(); }
+    realvec operator-(realvec x) const { __builtin_unreachable(); }
+    realvec operator*(realvec x) const { __builtin_unreachable(); }
+    realvec operator/(realvec x) const { __builtin_unreachable(); }
+    
+    realvec& operator+=(realvec const& x) { *this = *this + x; return *this; }  // each forwards to the binary operator,
+    realvec& operator-=(realvec const& x) { *this = *this - x; return *this; }  // assigns the result to *this
+    realvec& operator*=(realvec const& x) { *this = *this * x; return *this; }  // and returns *this
+    realvec& operator/=(realvec const& x) { *this = *this / x; return *this; }
+    
+    real_t prod() const { __builtin_unreachable(); }  // stub: must never be called
+    real_t sum() const { __builtin_unreachable(); }  // stub: must never be called
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }  // comparison stubs: bodies are
+    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }  // __builtin_unreachable(), so control
+    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }  // must never reach any of them
+    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+    
+    
+    
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }  // these forward to the generic MF:: helpers
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    boolvec_t signbit() const { return v; }  // builds the boolvec_t directly from the raw vector
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<fp8,32>::as_int() const -> intvec_t
+  {
+    return intvec_t(v);  // wraps the raw mask vector v in an intvec_t
+  }
+  
+  inline auto boolvec<fp8,32>::convert_int() const -> intvec_t
+  {
+    intvec_t raw = as_int();
+    return lsr(raw, bits-1);  // shift right by bits-1 leaves only the top bit of each element
+  }
+  
+  inline auto boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    realvec_t const picked = ifthen(x.as_float(), y.as_float());  // reuse the realvec overload
+    return picked.as_int();
+  }
+  
+  inline
+  auto boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    intvec_t sel = -convert_int();  // negated convert_int() serves as the bitwise selection mask
+    return ((sel & x.as_int()) | (~sel & y.as_int())).as_float();  // x where selected, y elsewhere
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<fp8,32>::as_float() const -> realvec_t
+  {
+    return realvec_t(v);  // wraps the raw vector v in a realvec_t
+  }
+  
+  inline auto intvec<fp8,32>::convert_float() const -> realvec_t
+  {
+    __builtin_unreachable();  // stub: must never be called
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_AVX_FP8_32_H
diff --git a/vec_double_avx.h b/vec_double_avx.h
deleted file mode 100644
index cf5e97c..0000000
--- a/vec_double_avx.h
+++ /dev/null
@@ -1,643 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_DOUBLE_AVX_H
-#define VEC_DOUBLE_AVX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// AVX intrinsics
-#include <immintrin.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m256d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
-                                            from_bool(as[2]),
-                                            from_bool(as[1]),
-                                            from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); }
-    
-    boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-      return ! (! *this).any();
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-      return ! _mm256_testz_pd(v, v);
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
-    static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi64(vlo, xvlo);
-      vhi = _mm_add_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi64(vlo, xvlo);
-      vhi = _mm_sub_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
-                                              _mm256_castsi256_pd(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
-#if 0
-      __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
-                                         _mm_srli_epi64(vlo, 63));
-      __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
-                                         _mm_srli_epi64(vhi, 63));
-      vlo = _mm_xor_si128(signmask01, vlo);
-      vhi = _mm_xor_si128(signmask23, vhi);
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      vlo = _mm_xor_si128(signmask01, vlo);
-      vhi = _mm_xor_si128(signmask23, vhi);
-#else
-      // Convert signed to unsiged
-      vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1)));
-      vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1)));
-      // Shift
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      // Undo conversion
-      vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n)));
-      vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n)));
-#endif
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi64(vlo, n);
-      vhi = _mm_slli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m256d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:4*double>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_pd(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_pd(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_pd(p, m.m.as_int(), v);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm256_castpd_si256(v); }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); }
-    realvec operator-(realvec x) const { return _mm256_sub_pd(v, x.v); }
-    realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); }
-    realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      // __m256d x = _mm256_hadd_pd(v, v);
-      // __m128d xlo = _mm256_extractf128_pd(x, 0);
-      // __m128d xhi = _mm256_extractf128_pd(x, 1);
-      realvec x = *this;
-      x = _mm256_hadd_pd(x.v, x.v);
-      return x[0] + x[2];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_NEQ_OQ);
-    }
-    boolvec_t operator<(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
-    }
-    boolvec_t operator>=(realvec const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return _mm256_ceil_pd(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return _mm256_floor_pd(v); }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const { return _mm256_max_pd(v, y.v); }
-    realvec fmin(realvec y) const { return _mm256_min_pd(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm256_sqrt_pd(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<double,4>::as_int() const -> intvec_t
-  {
-    return _mm256_castpd_si256(v);
-  }
-  
-  inline
-  auto boolvec<double,4>::convert_int() const -> intvec_t
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  auto boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  auto boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return _mm256_blendv_pd(y.v, x.v, v);
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<double,4>::as_float() const -> realvec_t
-  {
-    return _mm256_castsi256_pd(v);
-  }
-  
-  inline auto intvec<double,4>::convert_float() const -> realvec_t
-  {
-    return MF::vml_convert_float(*this);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_DOUBLE_AVX_H
diff --git a/vec_double_qpx.h b/vec_double_qpx.h
deleted file mode 100644
index 8ffdf67..0000000
--- a/vec_double_qpx.h
+++ /dev/null
@@ -1,667 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_DOUBLE_QPX_H
-#define VEC_DOUBLE_QPX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-#warning "TODO"
-#include <iostream>
-
-// QPX intrinsics
-#ifdef __clang__
-#  include <qpxintrin.h>
-#else
-#  include <builtins.h>
-#endif
-#include <mass_simd.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef vector4double bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // canonical true is +1.0, canonical false is -1.0
-    // >=0 is true, -0 is true, nan is false
-    static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
-    static bool to_bool(real_t a) { return a>=0.0; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vec_splats(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      // return to_bool(((real_t const*)&v)[n]);
-      return to_bool(v[n]);
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      // return ((real_t*)&v)[n]=from_bool(a), *this;
-      return v[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_not(v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); }
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const
-    {
-      return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-    }
-    bool any() const
-    {
-      return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef vector4double ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(FP::as_float(a))) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      const int_t iota_[] = {0, 1, 2, 4};
-      return intvec(iota_);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      // return ((int_t const*)&v)[n];
-      return FP::as_int(v[n]);
-    }
-    intvec& set_elt(int n, int_t a)
-    {
-      // return ((int_t*)&v)[n]=a, *this;
-      return v[n]=FP::as_float(a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]);
-      return r;
-    }
-    
-    intvec operator+(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] + x[d]);
-      return r;
-    }
-    intvec operator-(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]);
-      return r;
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]);
-      return r;
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]);
-      return r;
-    }
-    intvec operator|(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]);
-      return r;
-    }
-    intvec operator^(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]);
-      return r;
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n));
-      return r;
-    }
-    intvec operator>>(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n);
-      return r;
-    }
-    intvec operator<<(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n);
-      return r;
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d]));
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]);
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]);
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return *this < IV(I(0));
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]);
-      return r;
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]);
-      return r;
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]);
-      return r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]);
-      return r;
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]);
-      return r;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]);
-      return r;
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef vector4double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<QPX:4*double>"; }
-    void barrier() { __asm__("": "+v" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      // return ((real_t const*)&v)[n];
-      return v[n];
-    }
-    realvec& set_elt(int n, real_t a)
-    {
-      // return ((real_t*)&v)[n]=a, *this;
-      return v[n]=a, *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_lda(0, (real_t*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t v0 = vec_ld(0, (real_t*)p);
-      realvec_t v1 = vec_ld(31, (real_t*)p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p));
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-#warning "TODO"
-      std::cout << "yes this is storea\n";
-      vec_sta(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { return vec_ctid(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vec_neg(v); }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const
-    {
-      // return vec_swdiv_nochk(v, x.v);
-      return div_fastd4(v, x.v);
-    }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return ! (*this > x); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return ! (*this < x); }
-    
-    
-    
-    realvec acos() const { return acosd4(v); }
-    realvec acosh() const { return acoshd4(v); }
-    realvec asin() const { return asind4(v); }
-    realvec asinh() const { return asinhd4(v); }
-    realvec atan() const { return atand4(v); }
-    realvec atan2(realvec y) const { return atan2d4(v, y.v); }
-    realvec atanh() const { return atanhd4(v); }
-    realvec cbrt() const { return cbrtd4(v); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return vec_cpsgn(v, y.v); }
-    realvec cos() const { return cosd4(v); }
-    realvec cosh() const { return coshd4(v); }
-    realvec exp() const { return expd4(v); }
-    realvec exp10() const { return exp10d4(v); }
-    realvec exp2() const { return exp2d4(v); }
-    realvec expm1() const { return expm1d4(v); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
-    realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return hypotd4(v, y.v); }
-    intvec_t ilogb() const
-    {
-      int_t ilogb_[] = {
-	::ilogb((*this)[0]),
-	::ilogb((*this)[1]),
-	::ilogb((*this)[2]),
-	::ilogb((*this)[3])
-      };
-      return intvec_t(ilogb_);
-    }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return vec_tstnan(v, v); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
-    realvec ldexp(intvec_t n) const
-    {
-      real_t ldexp_[] = {
-	std::ldexp((*this)[0], n[0]),
-        std::ldexp((*this)[1], n[1]),
-        std::ldexp((*this)[2], n[2]),
-        std::ldexp((*this)[3], n[3])
-      };
-      return realvec_t(ldexp_);
-    }
-    realvec log() const { return logd4(v); }
-    realvec log10() const { return log10d4(v); }
-    realvec log1p() const { return log1pd4(v); }
-    realvec log2() const { return log2d4(v); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return powd4(v, y.v); }
-    realvec rcp() const { return recip_fastd4(v); }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return MF::vml_rint(*this); }
-    realvec round() const { return vec_round(v); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // one Newton iteration (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return !copysign(RV(1.0)).as_int().as_bool(); }
-    realvec sin() const { return sind4(v); }
-    realvec sinh() const { return sinhd4(v); }
-    realvec sqrt() const
-    {
-      // return vec_sqrtsw_nochk(v);
-      return *this * rsqrt();
-    }
-    realvec tan() const { return tand4(v); }
-    realvec tanh() const { return tanhd4(v); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  boolvec<double,4>::intvec_t boolvec<double,4>::as_int() const
-  {
-    return v;
-  }
-  
-  inline
-  boolvec<double,4>::intvec_t boolvec<double,4>::convert_int() const
-  {
-    return ifthen(IV(I(1)), IV(I(0)));
-  }
-  
-  inline
-  boolvec<double,4>::intvec_t boolvec<double,4>::ifthen(intvec_t x,
-                                                        intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  boolvec<double,4>::realvec_t boolvec<double,4>::ifthen(realvec_t x,
-                                                         realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,4>::realvec_t intvec<double,4>::as_float() const
-  {
-    return v;
-  }
-  
-  inline intvec<double,4>::realvec_t intvec<double,4>::convert_float() const
-  {
-    return vec_cfid(v);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_DOUBLE_QPX_H
diff --git a/vec_double_sse2.h b/vec_double_sse2.h
deleted file mode 100644
index 7c31d2d..0000000
--- a/vec_double_sse2.h
+++ /dev/null
@@ -1,646 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_DOUBLE_SSE2_H
-#define VEC_DOUBLE_SSE2_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
-#endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
-#endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
-#endif
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __m128d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return _mm_xor_pd(boolvec(true), v); }
-    
-    boolvec operator&&(boolvec x) const { return _mm_and_pd(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm_or_pd(v, x.v); }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const { return _mm_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1];
-#if defined __AVX__
-      return ! (! *this).any();
-#else
-      boolvec x = *this;
-      x = x && _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1));
-      return x[0];
-#endif
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1];
-#if defined __AVX__
-      return ! _mm_testz_pd(v, v);
-#else
-      boolvec x = *this;
-      x = x || _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1));
-      return x[0];
-#endif
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {}
-    static intvec iota() { return _mm_set_epi64x(1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const { return _mm_add_epi64(v, x.v); }
-    intvec operator-(intvec x) const { return _mm_sub_epi64(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v),
-                                        _mm_castsi128_pd(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return _mm_srli_epi64(v, n); }
-    intvec operator>>(int_t n) const
-    {
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
-      intvec x = *this;
-      // Convert signed to unsiged
-      x += U(1) << (bits-1);
-      // Shift
-      x = x.lsr(n);
-      // Undo conversion
-      x -= U(1) << (bits-1-n);
-      return x;
-    }
-    intvec operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __m128d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:2*double>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_pd(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_pd(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-#if defined __AVX__
-        _mm_maskstore_pd(p, m.m.as_int(), v);
-#else
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
-#endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castpd_si128(v); }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return _mm_add_pd(v, x.v); }
-    realvec operator-(realvec x) const { return _mm_sub_pd(v, x.v); }
-    realvec operator*(realvec x) const { return _mm_mul_pd(v, x.v); }
-    realvec operator/(realvec x) const { return _mm_div_pd(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
-    }
-    real_t sum() const
-    {
-#ifdef __SSE3__
-      return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
-#else
-      return (*this)[0] + (*this)[1];
-#endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const
-    {
-      return _mm_cmpeq_pd(v, x.v);
-    }
-    boolvec_t operator!=(realvec const& x) const
-    {
-      return _mm_cmpneq_pd(v, x.v);
-    }
-    boolvec_t operator<(realvec const& x) const
-    {
-      return _mm_cmplt_pd(v, x.v);
-    }
-    boolvec_t operator<=(realvec const& x) const
-    {
-      return _mm_cmple_pd(v, x.v);
-    }
-    boolvec_t operator>(realvec const& x) const
-    {
-      return _mm_cmpgt_pd(v, x.v);
-    }
-    boolvec_t operator>=(realvec const& x) const
-    {
-      return _mm_cmpge_pd(v, x.v);
-    }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-#ifdef __SSE4_1__
-      return _mm_ceil_pd(v);
-#else
-      return MF::vml_ceil(*this);
-#endif
- }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-#ifdef __SSE4_1__
-      return _mm_floor_pd(v);
-#else
-      return MF::vml_floor(*this);
-#endif
- }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const { return _mm_max_pd(v, y.v); }
-    realvec fmin(realvec y) const { return _mm_min_pd(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return _mm_cmpunord_pd(v, v);; }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-#ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-#else
-      return MF::vml_rint(*this);
-#endif
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm_sqrt_pd(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-#ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
-#else
-      return MF::vml_trunc(*this);
-#endif
- }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<double,2>::as_int() const -> intvec_t
-  {
-    return _mm_castpd_si128(v);
-  }
-  
-  inline
-  auto boolvec<double,2>::convert_int() const -> intvec_t
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  auto boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  auto boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-#ifdef __SSE4_1__
-    return _mm_blendv_pd(y.v, x.v, v);
-#else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-#endif
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<double,2>::as_float() const -> realvec_t
-  {
-    return _mm_castsi128_pd(v);
-  }
-  
-  inline auto intvec<double,2>::convert_float() const -> realvec_t
-  {
-    return MF::vml_convert_float(*this);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_DOUBLE_SSE2_H
diff --git a/vec_double_sse2_scalar.h b/vec_double_sse2_scalar.h
deleted file mode 100644
index 4c3b4b6..0000000
--- a/vec_double_sse2_scalar.h
+++ /dev/null
@@ -1,528 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_DOUBLE_SSE2_SCALAR_H
-#define VEC_DOUBLE_SSE2_SCALAR_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
-#endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
-#endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
-#endif
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_DOUBLE_1
-  template<> struct boolvec<double,1>;
-  template<> struct intvec<double,1>;
-  template<> struct realvec<double,1>;
-  
-  
-  
-  template<>
-  struct boolvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    // TODO: remove this
-    boolvec(int x): v(x) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return !v; }
-    
-    boolvec operator&&(boolvec x) const { return v && x.v; }
-    boolvec operator||(boolvec x) const { return v || x.v; }
-    boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); }
-    boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return v; }
-    bool any() const { return v; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return +v; }
-    intvec operator-() const { return -v; }
-    
-    intvec operator+(intvec x) const { return v+x.v; }
-    intvec operator-(intvec x) const { return v-x.v; }
-    intvec operator*(intvec x) const { return v*x.v; }
-    intvec operator/(intvec x) const { return v/x.v; }
-    intvec operator%(intvec x) const { return v%x.v; }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    intvec& operator/=(intvec const& x) { return *this=*this/x; }
-    intvec& operator%=(intvec const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec operator~() const { return ~v; }
-    
-    intvec operator&(intvec x) const { return v&x.v; }
-    intvec operator|(intvec x) const { return v|x.v; }
-    intvec operator^(intvec x) const { return v^x.v; }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return U(v) >> U(n); }
-    intvec operator>>(int_t n) const { return v>>n; }
-    intvec operator<<(int_t n) const { return v<<n; }
-    
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const { return U(v) >> U(n); }
-    intvec operator>>(intvec n) const { return v>>n; }
-    intvec operator<<(intvec n) const { return v<<n; }
-    
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const { return *this < IV(I(0)); }
-    
-    boolvec_t operator==(intvec const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec const& x) const { return v>=x.v; }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*double>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128d from_double(double a) { return _mm_set_sd(a); }
-    static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
-#ifdef __x86_64__
-      return _mm_cvttsd_si64(_mm_set_sd(v));
-#else
-      return floatprops::convert_int(v);
-#endif
-    }
-    
-    
-    
-    realvec operator+() const { return +v; }
-    realvec operator-() const { return -v; }
-    
-    realvec operator+(realvec x) const { return v+x.v; }
-    realvec operator-(realvec x) const { return v-x.v; }
-    realvec operator*(realvec x) const { return v*x.v; }
-    realvec operator/(realvec x) const { return v/x.v; }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const { return v; }
-    real_t sum() const { return v; }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-#ifdef __SSE4_1__
-      return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
-#else
-      return std::ceil(v);
-#endif
-    }
-    realvec copysign(realvec y) const { return std::copysign(v, y.v); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return std::fabs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-#ifdef __SSE4_1__
-      return to_double(_mm_floor_sd(from_double(v), from_double(v)));
-#else
-      return std::floor(v);
-#endif
-    }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const
-    {
-      return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
-    }
-    realvec fmin(realvec y) const
-    {
-      return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
-    }
-    realvec fmod(realvec y) const { return std::fmod(v, y.v); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = std::ilogb(v);
-      if (r == FP_ILOGB0) r = numeric_limits<int_t>::min();
-      else if (r == FP_ILOGBNAN) r = numeric_limits<int_t>::max();
-      return r;
-    }
-    boolvec_t isfinite() const { return std::isfinite(v); }
-    boolvec_t isinf() const { return std::isinf(v); }
-    boolvec_t isnan() const
-    {
-      return _mm_ucomineq_sd(from_double(v), from_double(v));
-    }
-    boolvec_t isnormal() const { return std::isnormal(v); }
-    realvec ldexp(int_t n) const { return std::ldexp(v, n); }
-    realvec ldexp(intvec_t n) const { return std::ldexp(v, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return R(1.0)/v; }
-    realvec remainder(realvec y) const { return std::remainder(v, y.v); }
-    realvec rint() const
-    {
-#ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_NEAREST_INT));
-#else
-      return MF::vml_rint(*this);
-#endif
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return std::signbit(v); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const
-    {
-      return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
-    }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-#ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_ZERO));
-#else
-      return MF::vml_trunc(*this);
-#endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<double,1>::as_int() const -> intvec_t
-  {
-    return I(v);
-  }
-  
-  inline
-  auto boolvec<double,1>::convert_int() const -> intvec_t
-  {
-    return v;
-  }
-  
-  inline
-  auto boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  auto boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<double,1>::as_float() const -> realvec_t
-  {
-    return FP::as_float(v);
-  }
-  
-  inline auto intvec<double,1>::convert_float() const -> realvec_t
-  {
-#ifdef __x86_64__
-    return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
-#else
-    return FP::convert_float(v);
-#endif
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_DOUBLE_SSE2_SCALAR_H
diff --git a/vec_double_vsx.h b/vec_double_vsx.h
deleted file mode 100644
index 67b4b36..0000000
--- a/vec_double_vsx.h
+++ /dev/null
@@ -1,656 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_DOUBLE_VSX_H
-#define VEC_DOUBLE_VSX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// VSX intrinsics
-#include <altivec.h>
-#undef vector
-#undef pixel
-#undef bool
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __vector __bool long long bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    // truth values are interpreted bit-wise
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vec_splats(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const
-    {
-      return
-	(__vector __bool long long)(__vector long long)
-	vec_nor((__vector double)(__vector long long)v,
-		(__vector double)(__vector long long)v);
-    }
-    
-    boolvec operator&&(boolvec x) const
-    {
-      return
-	(__vector __bool long long)(__vector long long)
-	vec_and((__vector double)(__vector long long)v,
-		(__vector double)(__vector long long)x.v);
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return
-	(__vector __bool long long)(__vector long long)
-	vec_or((__vector double)(__vector long long)v,
-	       (__vector double)(__vector long long)x.v);
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return
-	(__vector __bool long long)(__vector long long)
-	vec_xor((__vector double)(__vector long long)v,
-		(__vector double)(__vector long long)x.v);
-    }
-    
-    bool all() const
-    {
-      return vec_all_ne((__vector int)v, (__vector int)BV(false).v);
-    }
-    bool any() const
-    {
-      return vec_any_ne((__vector int)v, (__vector int)BV(false).v);
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __vector long long ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector long long){0, 1}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool long long)v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Permutation control words
-  private:
-    // 0123 4567 -> 1436
-    // exchange pairs
-    static __vector unsigned char perm_int_swap()
-    {
-      return
-	(__vector unsigned char)
-	{4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27};
-    }
-    // 0123 4567 -> 0426
-    // broadcast high elements of pairs
-    static __vector unsigned char perm_int_bchi()
-    {
-      return
-	(__vector unsigned char)
-	{0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
-    }
-  public:
-    
-    
-
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      // return vec_add(v, x.v);
-      __vector unsigned int a = (__vector unsigned int)v;
-      __vector unsigned int b = (__vector unsigned int)x.v;
-      __vector unsigned int s = vec_add(a, b);
-      __vector unsigned int c = vec_addc(a, b);
-      __vector unsigned int z = vec_xor(z, z);
-      c = vec_perm(c, z, perm_int_swap());
-      s = vec_add(s, c);
-      return (__vector long long)s;
-    }
-    intvec operator-(intvec x) const
-    {
-      // return vec_sub(v, x.v);
-      __vector unsigned int a = (__vector unsigned int)v;
-      __vector unsigned int b = (__vector unsigned int)x.v;
-      __vector unsigned int d = vec_sub(a, b);
-      __vector unsigned int c = vec_subc(a, b);
-      c = vec_sub(vec_splats(1U), c);
-      __vector unsigned int z = vec_xor(z, z);
-      c = vec_perm(c, z, perm_int_swap());
-      d = vec_sub(d, c);
-      return (__vector long long)d;
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      return (__vector long long)vec_nor((__vector int)v, (__vector int)v);
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      return (__vector long long)vec_and((__vector int)v, (__vector int)x.v);
-    }
-    intvec operator|(intvec x) const
-    {
-      return (__vector long long)vec_or ((__vector int)v, (__vector int)x.v);
-    }
-    intvec operator^(intvec x) const
-    {
-      return (__vector long long)vec_xor((__vector int)v, (__vector int)x.v);
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return lsr(IV(n)); }
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      // return vec_sr(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      // return vec_sra(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      // return vec_sl(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return (*this >> (bits-1)).as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      // return vec_cmpeq(v, x.v);
-      __vector int a = (__vector int)v;
-      __vector int b = (__vector int)x.v;
-      __vector __bool int c = vec_cmpeq(a, b);
-      __vector __bool int cx = vec_perm(c, c, perm_int_swap());
-      __vector __bool int r = vec_and(c, cx);
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const
-    {
-      __vector int a = (__vector int)v;
-      __vector int b = (__vector int)x.v;
-      __vector __bool int lt = vec_cmplt(a, b);
-      __vector __bool int eq = vec_cmpeq(a, b);
-      __vector unsigned int ua = (__vector unsigned int)v;
-      __vector unsigned int ub = (__vector unsigned int)x.v;
-      __vector __bool int ult = vec_cmplt(ua, ub);
-      __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
-      __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
-      r = vec_perm(r, r, perm_int_bchi());
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __vector double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<VSX:2*double>"; }
-    void barrier() { __asm__("": "+v" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_ld(0, (const __vector double*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t v0 = vec_ld(0, (const __vector double*)p);
-      realvec_t v1 = vec_ld(15, (const __vector double*)p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_st(v, 0, (__vector double*)p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-	// Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-	// Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return (__vector long long) v; }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const { return vec_div(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp)
-      r += r * (RV(1.0) - x*r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_rint(v); }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const
-    {
-      // realvec x = *this;
-      // realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // // TODO: use fma
-      // // one Newton iteration (see vml_rsqrt)
-      // r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      // return r;
-      return vec_rsqrt(v);
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    // realvec sqrt() const { return *this * rsqrt(); }
-    realvec sqrt() const { return vec_sqrt(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<double,2>::as_int() const -> intvec_t
-  {
-    return (__vector long long) v;
-  }
-  
-  inline
-  auto boolvec<double,2>::convert_int() const -> intvec_t
-  {
-    return -(__vector long long)v;
-  }
-  
-  inline
-  auto boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  auto boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<double,2>::as_float() const -> realvec_t
-  {
-    return (__vector double)v;
-  }
-  
-  inline auto intvec<double,2>::convert_float() const -> realvec_t
-  {
-    // return vec_ctd(v, 0);
-    return MF::vml_convert_float(*this);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_DOUBLE_VSX_H
diff --git a/vec_float_altivec.h b/vec_float_altivec.h
deleted file mode 100644
index 1fdcbb4..0000000
--- a/vec_float_altivec.h
+++ /dev/null
@@ -1,553 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FLOAT_ALTIVEC_H
-#define VEC_FLOAT_ALTIVEC_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// Altivec intrinsics
-#include <altivec.h>
-#undef vector
-#undef pixel
-#undef bool
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __vector __bool int bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vec_splats(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_nor(v, v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    // boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator==(boolvec x) const; // defined after intvec
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const { return vec_all_ne(v, BV(false).v); }
-    bool any() const { return vec_any_ne(v, BV(false).v); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __vector int ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector int){0, 1, 2, 3}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool int)v; }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(0) - *this; }
-    
-    intvec operator+(intvec x) const { return vec_add(v, x.v); }
-    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return vec_nor(v, v); }
-    
-    intvec operator&(intvec x) const { return vec_and(v, x.v); }
-    intvec operator|(intvec x) const { return vec_or(v, x.v); }
-    intvec operator^(intvec x) const { return vec_xor(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return lsr(IV(n)); }
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      return vec_sr(v, (__vector unsigned int)n.v);
-    }
-    intvec operator>>(intvec n) const
-    {
-      return vec_sra(v, (__vector unsigned int)n.v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vec_sl(v, (__vector unsigned int)n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return *this < IV(I(0));
-    }
-    
-    boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return !(*this > x); }
-    boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return !(*this < x); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __vector float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<Altivec:4*float>"; }
-    void barrier() { __asm__("": "+v" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_ld(0, p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t v0 = vec_ld(0, p);
-      realvec_t v1 = vec_ld(15, p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_st(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-	// Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-	// Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return (__vector int) v; }
-    intvec_t convert_int() const { return vec_cts(v, 0); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const {
-#if defined __VSX__
-      return vec_mul(v, x.v);
-#else
-      return vec_madd(v, x.v, RV(0.0).v);
-#endif
-    }
-    realvec operator/(realvec x) const {
-#if defined __VSX__
-      return vec_div(v, x.v);
-#else
-      return *this * x.rcp();
-#endif
-    }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_round(v); }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const
-    {
-#if defined __VSX__
-      return vec_rsqrt(v);
-#else
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // one Newton iteration (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-#endif
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const {
-#if defined __VSX__
-      return vec_sqrt(v);
-#else
-      return *this * rsqrt();
-#endif
-    }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<float,4>::as_int() const -> intvec_t
-  {
-    return (__vector int) v;
-  }
-  
-  inline
-  auto boolvec<float,4>::convert_int() const -> intvec_t
-  {
-    return -(__vector int)v;
-  }
-  
-  inline
-  auto boolvec<float,4>::operator==(boolvec x) const -> boolvec_t
-  {
-    return as_int() == x.as_int();
-  }
-  
-  inline
-  auto boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  auto boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<float,4>::as_float() const -> realvec_t
-  {
-    return (__vector float)v;
-  }
-  
-  inline auto intvec<float,4>::convert_float() const -> realvec_t
-  {
-    return vec_ctf(v, 0);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FLOAT_ALTIVEC_H
diff --git a/vec_float_avx.h b/vec_float_avx.h
deleted file mode 100644
index 6c4f1ee..0000000
--- a/vec_float_avx.h
+++ /dev/null
@@ -1,646 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FLOAT_AVX_H
-#define VEC_FLOAT_AVX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// AVX intrinsics
-#include <immintrin.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FLOAT_8
-  template<> struct boolvec<float,8>;
-  template<> struct intvec<float,8>;
-  template<> struct realvec<float,8>;
-  
-  
-  
-  // Mask type that accompanies realvec<float,8>: eight boolean lanes kept
-  // in a single __m256 register. Only the sign bit of a lane carries its
-  // truth value.
-  template<>
-  struct boolvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    using scalar_t = bool;
-    using bvector_t = __m256;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(sizeof(bvector_t) == size * sizeof(real_t),
-                  "vector size is wrong");
-    
-  private:
-    // Lane encoding: a true lane has its sign bit set, a false lane has
-    // it clear
-    static uint_t from_bool(bool a) { return uint_t(0) - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(0) > int_t(a); }
-  public:
-    
-    using boolvec_t = boolvec;
-    using intvec_t = intvec<real_t, size>;
-    using realvec_t = realvec<real_t, size>;
-    
-    // Abbreviations used for type casts
-    using R = real_t;
-    using I = int_t;
-    using U = uint_t;
-    using RV = realvec_t;
-    using IV = intvec_t;
-    using BV = boolvec_t;
-    using FP = floatprops<real_t>;
-    using MF = mathfuncs<realvec_t>;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // No copy constructor or copy assignment is declared on purpose: a
-    // non-trivial one would prevent objects from being passed in
-    // registers.
-    boolvec(bvector_t x): v(x) {}
-    // Broadcast one boolean to all lanes
-    boolvec(bool a):
-    v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
-    // Read eight booleans; as[n] ends up in lane n
-    boolvec(bool const* as):
-    v(_mm256_castsi256_ps(_mm256_setr_epi32(from_bool(as[0]),
-                                            from_bool(as[1]),
-                                            from_bool(as[2]),
-                                            from_bool(as[3]),
-                                            from_bool(as[4]),
-                                            from_bool(as[5]),
-                                            from_bool(as[6]),
-                                            from_bool(as[7])))) {}
-    
-    operator bvector_t() const { return v; }
-    // Lane access goes through a pointer to the register's storage
-    bool operator[](int n) const
-    {
-      uint_t const* const lanes = (uint_t const*)&v;
-      return to_bool(lanes[n]);
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      uint_t* const lanes = (uint_t*)&v;
-      lanes[n] = from_bool(a);
-      return *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    // Logical negation: xor with an all-true mask
-    boolvec operator!() const
-    {
-      boolvec const all_true(true);
-      return _mm256_xor_ps(all_true, v);
-    }
-    
-    boolvec operator&&(boolvec x) const { return _mm256_and_ps(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm256_or_ps(v, x.v); }
-    boolvec operator==(boolvec x) const
-    {
-      boolvec const differs = *this != x;
-      return !differs;
-    }
-    boolvec operator!=(boolvec x) const { return _mm256_xor_ps(v, x.v); }
-    
-    // True if every lane is true: no lane of the negated mask may be set
-    bool all() const
-    {
-      boolvec const negated = !*this;
-      return !negated.any();
-    }
-    // True if at least one lane is true; _mm256_testz_ps returns
-    // non-zero only when the tested bits are all zero
-    bool any() const
-    {
-      return _mm256_testz_ps(v, v) == 0;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  // Integer vector that accompanies realvec<float,8>: eight int_t lanes
-  // kept in a single __m256i register. Integer arithmetic and shifts are
-  // carried out separately on the two 128-bit halves of the register;
-  // bitwise operations go through the floating-point logic intrinsics.
-  template<>
-  struct intvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    using scalar_t = int_t;
-    using ivector_t = __m256i;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(sizeof(ivector_t) == size * sizeof(real_t),
-                  "vector size is wrong");
-    
-    using boolvec_t = boolvec<real_t, size>;
-    using intvec_t = intvec;
-    using realvec_t = realvec<real_t, size>;
-    
-    // Abbreviations used for type casts
-    using R = real_t;
-    using I = int_t;
-    using U = uint_t;
-    using RV = realvec_t;
-    using IV = intvec_t;
-    using BV = boolvec_t;
-    using FP = floatprops<real_t>;
-    using MF = mathfuncs<realvec_t>;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // No copy constructor or copy assignment is declared on purpose: a
-    // non-trivial one would prevent objects from being passed in
-    // registers.
-    intvec(ivector_t x): v(x) {}
-    // Broadcast one integer to all lanes
-    intvec(int_t a): v(_mm256_set1_epi32(a)) {}
-    // Read eight integers; as[n] ends up in lane n
-    intvec(int_t const* as): v(_mm256_setr_epi32(as[0], as[1], as[2], as[3],
-                                                 as[4], as[5], as[6], as[7])) {}
-    // Lane n holds the value n
-    static intvec iota() { return _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); }
-    
-    operator ivector_t() const { return v; }
-    // Lane access goes through a pointer to the register's storage
-    int_t operator[](int n) const
-    {
-      int_t const* const lanes = (int_t const*)&v;
-      return lanes[n];
-    }
-    intvec& set_elt(int n, int_t a)
-    {
-      int_t* const lanes = (int_t*)&v;
-      lanes[n] = a;
-      return *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Since boolvec values
-      // depend only on the sign bit, a lane is true if either x itself
-      // has the sign bit set, or adding (signbit_mask - 1) to x sets it.
-      intvec const self = *this;
-      boolvec_t const sign_set = self.as_bool();
-      boolvec_t const carries_into_sign =
-        (self + (FP::signbit_mask - 1)).as_bool();
-      return sign_set || carries_into_sign;
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
-      intvec const zero = IV(0);
-      return zero - *this;
-    }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i const lhs_lo = _mm256_castsi256_si128(v);
-      __m128i const lhs_hi = _mm256_extractf128_si256(v, 1);
-      __m128i const rhs_lo = _mm256_castsi256_si128(x.v);
-      __m128i const rhs_hi = _mm256_extractf128_si256(x.v, 1);
-      __m128i const sum_lo = _mm_add_epi32(lhs_lo, rhs_lo);
-      __m128i const sum_hi = _mm_add_epi32(lhs_hi, rhs_hi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(sum_lo), sum_hi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i const lhs_lo = _mm256_castsi256_si128(v);
-      __m128i const lhs_hi = _mm256_extractf128_si256(v, 1);
-      __m128i const rhs_lo = _mm256_castsi256_si128(x.v);
-      __m128i const rhs_hi = _mm256_extractf128_si256(x.v, 1);
-      __m128i const diff_lo = _mm_sub_epi32(lhs_lo, rhs_lo);
-      __m128i const diff_hi = _mm_sub_epi32(lhs_hi, rhs_hi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(diff_lo), diff_hi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { *this = *this + x; return *this; }
-    intvec& operator-=(intvec const& x) { *this = *this - x; return *this; }
-    
-    
-    
-    // Bitwise complement: xor with an all-ones vector
-    intvec operator~() const
-    {
-      intvec const all_ones = IV(~U(0));
-      return all_ones ^ *this;
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      __m256 const lhs = _mm256_castsi256_ps(v);
-      __m256 const rhs = _mm256_castsi256_ps(x.v);
-      return _mm256_castps_si256(_mm256_and_ps(lhs, rhs));
-    }
-    intvec operator|(intvec x) const
-    {
-      __m256 const lhs = _mm256_castsi256_ps(v);
-      __m256 const rhs = _mm256_castsi256_ps(x.v);
-      return _mm256_castps_si256(_mm256_or_ps(lhs, rhs));
-    }
-    intvec operator^(intvec x) const
-    {
-      __m256 const lhs = _mm256_castsi256_ps(v);
-      __m256 const rhs = _mm256_castsi256_ps(x.v);
-      return _mm256_castps_si256(_mm256_xor_ps(lhs, rhs));
-    }
-    
-    intvec& operator&=(intvec const& x) { *this = *this & x; return *this; }
-    intvec& operator|=(intvec const& x) { *this = *this | x; return *this; }
-    intvec& operator^=(intvec const& x) { *this = *this ^ x; return *this; }
-    
-    
-    
-    // Shifts by a scalar count: lsr is the logical right shift,
-    // operator>> the arithmetic one
-    intvec lsr(int_t n) const
-    {
-      __m128i const lo = _mm_srli_epi32(_mm256_castsi256_si128(v), n);
-      __m128i const hi = _mm_srli_epi32(_mm256_extractf128_si256(v, 1), n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i const lo = _mm_srai_epi32(_mm256_castsi256_si128(v), n);
-      __m128i const hi = _mm_srai_epi32(_mm256_extractf128_si256(v, 1), n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i const lo = _mm_slli_epi32(_mm256_castsi256_si128(v), n);
-      __m128i const hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-    }
-    intvec& operator>>=(int_t n) { *this = *this >> n; return *this; }
-    intvec& operator<<=(int_t n) { *this = *this << n; return *this; }
-    
-    // Shifts by per-lane counts are evaluated one lane at a time
-    intvec lsr(intvec n) const
-    {
-      intvec shifted;
-      for (int lane = 0; lane != size; ++lane) {
-        shifted.set_elt(lane, U((*this)[lane]) >> U(n[lane]));
-      }
-      return shifted;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec shifted;
-      for (int lane = 0; lane != size; ++lane) {
-        shifted.set_elt(lane, (*this)[lane] >> n[lane]);
-      }
-      return shifted;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec shifted;
-      for (int lane = 0; lane != size; ++lane) {
-        shifted.set_elt(lane, (*this)[lane] << n[lane]);
-      }
-      return shifted;
-    }
-    intvec& operator>>=(intvec n) { *this = *this >> n; return *this; }
-    intvec& operator<<=(intvec n) { *this = *this << n; return *this; }
-    
-    
-    
-    // The sign bit is exactly what a boolvec lane encodes
-    boolvec_t signbit() const
-    {
-      return as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      boolvec_t const differs = *this != x;
-      return !differs;
-    }
-    // Lanes differ where the xor of both operands is non-zero
-    boolvec_t operator!=(intvec const& x) const
-    {
-      intvec const delta = *this ^ x;
-      return delta.convert_bool();
-    }
-    // Evaluated one lane at a time
-    boolvec_t operator<(intvec const& x) const
-    {
-      boolvec_t less;
-      for (int lane = 0; lane != size; ++lane) {
-        less.set_elt(lane, (*this)[lane] < x[lane]);
-      }
-      return less;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      boolvec_t const greater = *this > x;
-      return !greater;
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      boolvec_t const less = *this < x;
-      return !less;
-    }
-  };
-  
-  
-  
-  // Vector of eight single-precision floats held in one AVX (__m256)
-  // register. Operations without a direct intrinsic are forwarded to the
-  // generic mathfuncs implementations (MF::vml_*).
-  template<>
-  struct realvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef real_t scalar_t;
-    typedef __m256 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:8*float>"; }
-    // Empty asm statement that names v as a read-write register operand
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    // Broadcast one value to all lanes
-    realvec(real_t a): v(_mm256_set1_ps(a)) {}
-    // Read eight values; as[n] ends up in lane n
-    realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4],
-                                               as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    // Lane access goes through a pointer to the register's storage
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    // Load from an address that must be a multiple of `alignment`
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_ps(p);
-    }
-    // Load from an arbitrarily aligned address
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_ps(p);
-    }
-    // Load from p+ioff where p itself is aligned; the aligned load is
-    // used when ioff is a multiple of the vector size
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    // Masked loads: lanes where the mask is false keep the value they
-    // have in *this
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      // Use the flag stored in the mask, like all other masked
-      // loads and stores of this type
-      if (__builtin_expect(m.all_m, true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    // Store to an address that must be a multiple of `alignment`
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_ps(p, v);
-    }
-    // Store to an arbitrarily aligned address
-    void storeu(real_t* p) const
-    {
-      _mm256_storeu_ps(p, v);
-    }
-    // Store to p+ioff where p itself is aligned; the aligned store is
-    // used when ioff is a multiple of the vector size
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    // Masked stores: only lanes where the mask is true are written
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_ps(p, m.m.as_int(), v);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    // as_int reinterprets the bits, convert_int converts the values
-    // (with truncation, _mm256_cvttps_epi32)
-    intvec_t as_int() const { return _mm256_castps_si256(v); }
-    intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    // Negation flips the sign bit. Computing 0.0 - x instead would turn
-    // +0.0 into +0.0 rather than -0.0.
-    realvec operator-() const
-    {
-      return _mm256_xor_ps(v, _mm256_set1_ps(R(-0.0)));
-    }
-    
-    realvec operator+(realvec x) const { return _mm256_add_ps(v, x.v); }
-    realvec operator-(realvec x) const { return _mm256_sub_ps(v, x.v); }
-    realvec operator*(realvec x) const { return _mm256_mul_ps(v, x.v); }
-    realvec operator/(realvec x) const { return _mm256_div_ps(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    // Product of all eight lanes
-    real_t prod() const
-    {
-      return
-        (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
-        (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
-    }
-    // Sum of all eight lanes. Two horizontal adds leave the sum of each
-    // 128-bit half in every lane of that half; lanes 0 and 4 are then
-    // added.
-    real_t sum() const
-    {
-      realvec x = *this;
-      x = _mm256_hadd_ps(x.v, x.v);
-      x = _mm256_hadd_ps(x.v, x.v);
-      return x[0] + x[4];
-    }
-    
-    
-    
-    // Lane-wise comparisons. All ordered predicates are false when an
-    // operand is NaN; operator!= uses the unordered predicate so that it
-    // is true in that case and stays the negation of operator==.
-    boolvec_t operator==(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ);
-    }
-    boolvec_t operator<(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
-    }
-    boolvec_t operator>=(realvec const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    // Elementary functions: AVX intrinsics where used below, otherwise
-    // the generic MF::vml_* implementations
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return _mm256_ceil_ps(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return _mm256_floor_ps(v); }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const { return _mm256_max_ps(v, y.v); }
-    realvec fmin(realvec y) const { return _mm256_min_ps(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    // A value is NaN exactly if it compares unordered with itself
-    boolvec_t isnan() const { return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    // Reciprocal: hardware estimate refined by one Newton iteration
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = _mm256_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    // Reciprocal square root: hardware estimate refined by one Newton
-    // iteration
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = _mm256_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    // The register itself serves as the mask: a boolvec lane is true
-    // exactly if its sign bit is set
-    boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm256_sqrt_ps(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  // Views the mask register as an integer vector; the bits are not
-  // changed.
-  inline
-  auto boolvec<float,8>::as_int() const -> intvec_t
-  {
-    intvec_t raw = _mm256_castps_si256(v);
-    return raw;
-  }
-  
-  // Logically shifts the integer view right by bits-1 positions, which
-  // moves the sign bit of every lane into the lowest bit.
-  inline
-  auto boolvec<float,8>::convert_int() const -> intvec_t
-  {
-    intvec_t const raw = as_int();
-    return lsr(raw, bits-1);
-  }
-  
-  // ifthen(condition, then-value, else-value) for integer vectors:
-  // selects on the float views of both arguments and views the result
-  // as integers again.
-  inline
-  auto boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    realvec_t const chosen = ifthen(x.as_float(), y.as_float());
-    return chosen.as_int();
-  }
-  
-  // ifthen(condition, then-value, else-value) for real vectors:
-  // _mm256_blendv_ps is called with y first, then x, and this mask as
-  // selector.
-  inline
-  auto boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    realvec_t chosen = _mm256_blendv_ps(y.v, x.v, v);
-    return chosen;
-  }
-
-  
-  
-  // intvec definitions
-  
-  // Views the integer register as a float vector; the bits are not
-  // changed.
-  inline auto intvec<float,8>::as_float() const -> realvec_t
-  {
-    realvec_t raw = _mm256_castsi256_ps(v);
-    return raw;
-  }
-  
-  // Converts the integer value of every lane to floating point.
-  inline auto intvec<float,8>::convert_float() const -> realvec_t
-  {
-    realvec_t converted = _mm256_cvtepi32_ps(v);
-    return converted;
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FLOAT_AVX_H
diff --git a/vec_float_neon.h b/vec_float_neon.h
deleted file mode 100644
index fccc10f..0000000
--- a/vec_float_neon.h
+++ /dev/null
@@ -1,558 +0,0 @@
-// -*-C++-*-
-
-// <http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html>
-
-#ifndef VEC_FLOAT_NEON_H
-#define VEC_FLOAT_NEON_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// Neon intrinsics
-#include <arm_neon.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FLOAT_2
-  template<> struct boolvec<float,2>;
-  template<> struct intvec<float,2>;
-  template<> struct realvec<float,2>;
-  
-  
-  
-  // Mask type that accompanies realvec<float,2>: two boolean lanes kept
-  // in one NEON uint32x2_t register. A true lane is stored as -1 (all
-  // bits set), a false lane as 0.
-  template<>
-  struct boolvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    using scalar_t = bool;
-    using bvector_t = uint32x2_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(sizeof(bvector_t) == size * sizeof(real_t),
-                  "vector size is wrong");
-    
-  private:
-    // Lane encoding: true is -1, false is 0
-    static uint_t from_bool(bool a) { return int_t(0) - int_t(a); }
-    static bool to_bool(uint_t a) { return a != 0; }
-  public:
-    
-    using boolvec_t = boolvec;
-    using intvec_t = intvec<real_t, size>;
-    using realvec_t = realvec<real_t, size>;
-    
-    // Abbreviations used for type casts
-    using R = real_t;
-    using I = int_t;
-    using U = uint_t;
-    using RV = realvec_t;
-    using IV = intvec_t;
-    using BV = boolvec_t;
-    using FP = floatprops<real_t>;
-    using MF = mathfuncs<realvec_t>;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // No copy constructor or copy assignment is declared on purpose: a
-    // non-trivial one would prevent objects from being passed in
-    // registers.
-    boolvec(bvector_t x): v(x) {}
-    // Broadcast one boolean to both lanes
-    boolvec(bool a): v(vdup_n_u32(from_bool(a))) {}
-    // Read two booleans; as[n] ends up in lane n
-    boolvec(bool const* as)
-    {
-      for (int lane = 0; lane != size; ++lane) {
-        set_elt(lane, as[lane]);
-      }
-    }
-    
-    operator bvector_t() const { return v; }
-    // Lane access goes through a pointer to the register's storage
-    bool operator[](int n) const
-    {
-      uint_t const* const lanes = (uint_t const*)&v;
-      return to_bool(lanes[n]);
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      uint_t* const lanes = (uint_t*)&v;
-      lanes[n] = from_bool(a);
-      return *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vmvn_u32(v); }
-    
-    boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
-    boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
-    boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
-    boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
-    
-    // True if both lanes are true: the pairwise minimum of the two
-    // lanes must be non-zero
-    bool all() const
-    {
-      boolvec const lane_min(vpmin_u32(v, v));
-      return to_bool(lane_min[0]);
-    }
-    // True if at least one lane is true: the pairwise maximum of the
-    // two lanes must be non-zero
-    bool any() const
-    {
-      boolvec const lane_max(vpmax_u32(v, v));
-      return to_bool(lane_max[0]);
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  // Integer vector that accompanies realvec<float,2>: two int_t lanes
-  // kept in one NEON int32x2_t register.
-  template<>
-  struct intvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef int32x2_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    // Broadcast one integer to both lanes
-    intvec(int_t a): v(vdup_n_s32(a)) {}
-    // Read two integers; as[n] ends up in lane n
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    // Lane n holds the value n. The lanes are filled through the pointer
-    // constructor so that their order matches what operator[] reads
-    // back: lane 0 is 0 and lane 1 is 1.
-    static intvec iota()
-    {
-      int_t const lanes[size] = {0, 1};
-      return intvec(lanes);
-    }
-    
-    operator ivector_t() const { return v; }
-    // Lane access goes through a pointer to the register's storage
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    // Vector casts do not change the bit pattern
-    boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
-    // Lanes that are non-zero become true
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vneg_s32(v); }
-    
-    intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
-    intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
-    intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    
-    
-    
-    intvec operator~() const { return vmvn_s32(v); }
-    
-    intvec operator&(intvec x) const { return vand_s32(v, x.v); }
-    intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
-    intvec operator^(intvec x) const { return veor_s32(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    // Shifts by a scalar count broadcast the count and use the
-    // per-lane versions below
-    intvec lsr(int_t n) const { return lsr(IV(n)); }
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    // Right shifts are expressed as vshl with a negated count: the
-    // unsigned variant for the logical shift (lsr), the signed variant
-    // for operator>>
-    intvec lsr(intvec n) const
-    {
-      return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
-    }
-    intvec operator>>(intvec n) const
-    {
-      return vshl_s32(v, (-n).v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vshl_s32(v, n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    // Shifting right by bits-1 with the signed shift fills every lane
-    // with copies of its sign bit, i.e. -1 for negative lanes, else 0
-    boolvec_t signbit() const
-    {
-      //return *this < IV(I(0));
-      return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); }
-    boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef float32x2_t vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<NEON:2*float>"; }
-    void barrier() { __asm__("": "+w" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vdup_n_f32(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vld1_f32(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-#if defined __ARM_FEATURE_UNALIGNED
-      return vld1_f32(p);
-#else
-#  error "unaligned NEON loads not implemented"
-#endif
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vst1_f32(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // p[0] = (*this)[0];
-      // p[1] = (*this)[1];
-#if defined __ARM_FEATURE_UNALIGNED
-      vst1_f32(p, v);
-#else
-#  error "unaligned NEON stores not implemented"
-#endif
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return vreinterpret_s32_f32(v); }
-    intvec_t convert_int() const { return vcvt_s32_f32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vneg_f32(v); }
-    
-    realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
-    realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
-    realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
-    realvec operator/(realvec x) const { return *this * x.rcp(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
-    }
-    real_t sum() const
-    {
-      realvec r = vpadd_f32(v, v);
-      return r[0];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-      // return vrndp_f32(v);
-      return MF::vml_ceil(*this);
-    }
-    realvec copysign(realvec y) const
-    {
-      return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
-    }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vabs_f32(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-      // return vrndm_f32(v);
-      return MF::vml_floor(*this);
-    }
-    realvec fma(realvec y, realvec z) const
-    {
-      // TODO: vfma_f32
-      return vmla_f32(z.v, v, y.v);
-    }
-    realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
-    realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec r = vrecpe_f32(v);
-      r *= vrecps_f32(v, r);
-      r *= vrecps_f32(v, r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      // return vrndn_f32(v);
-      return MF::vml_rint(*this);
-    }
-    realvec round() const
-    {
-      // return vrnda_f32(v);
-      return MF::vml_round(*this);
-    }
-    realvec rsqrt() const
-    {
-      realvec r = vrsqrte_f32(v);
-      r *= vrsqrts_f32(v, r*r);
-      r *= vrsqrts_f32(v, r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return *this * rsqrt(); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-      // return vrnd_f32(v);
-      return MF::vml_trunc(*this);
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<float,2>::as_int() const -> intvec_t
-  {
-    return vreinterpret_s32_u32(v);
-  }
-  
-  inline
-  auto boolvec<float,2>::convert_int() const -> intvec_t
-  {
-    return - as_int();
-  }
-  
-  inline
-  auto boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return vbsl_s32(v, x.v, y.v);
-  }
-  
-  inline
-  auto boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return vbsl_f32(v, x.v, y.v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<float,2>::as_float() const -> realvec_t
-  {
-    return vreinterpret_f32_s32(v);
-  }
-  
-  inline auto intvec<float,2>::convert_float() const -> realvec_t
-  {
-    return vcvt_f32_s32(v);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FLOAT_NEON_H
diff --git a/vec_float_sse2.h b/vec_float_sse2.h
deleted file mode 100644
index c87ae02..0000000
--- a/vec_float_sse2.h
+++ /dev/null
@@ -1,651 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FLOAT_SSE2_H
-#define VEC_FLOAT_SSE2_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// SSE2 intrinsics
-#include <xmmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
-#endif
-#if defined __SSE4_1__          // Intel's SSE 4.1
-#  include <smmintrin.h>
-#endif
-#if defined __SSE4A__           // AMD's SSE 4a
-#  include <ammintrin.h>
-#endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
-#endif
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m128 bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - int_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]),
-                                     from_bool(as[2]),
-                                     from_bool(as[1]),
-                                     from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return _mm_xor_ps(boolvec(true), v); }
-    
-    boolvec operator&&(boolvec x) const { return _mm_and_ps(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm_or_ps(v, x.v); }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const { return _mm_xor_ps(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-#if defined __AVX__
-      return ! (! *this).any();
-#else
-      boolvec x = *this;
-      x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2));
-      x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0];
-#endif
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-#if defined __AVX__
-      return ! _mm_testz_ps(v, v);
-#else
-      boolvec x = *this;
-      x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2));
-      x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0];
-#endif
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi32(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
-    static intvec iota() { return _mm_set_epi32(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(0) - *this; }
-    
-    intvec operator+(intvec x) const { return _mm_add_epi32(v, x.v); }
-    intvec operator-(intvec x) const { return _mm_sub_epi32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v),
-                                        _mm_castsi128_ps(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return _mm_srli_epi32(v, n); }
-    intvec operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
-    intvec operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return as_bool();
-    }
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m128 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:4*float>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_ps(a)) {}
-    realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_ps(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_ps(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      if (ioff==0) return loada(p);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_ps(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_ps(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-#if defined __AVX__
-        _mm_maskstore_ps(p, m.m.as_int(), v);
-#else
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-#endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castps_si128(v); }
-    intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return _mm_add_ps(v, x.v); }
-    realvec operator-(realvec x) const { return _mm_sub_ps(v, x.v); }
-    realvec operator*(realvec x) const { return _mm_mul_ps(v, x.v); }
-    realvec operator/(realvec x) const { return _mm_div_ps(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-#ifdef __SSE3__
-      realvec x = *this;
-      x = _mm_hadd_ps(x.v, x.v);
-      x = _mm_hadd_ps(x.v, x.v);
-      return x[0];
-#else
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-#endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const
-    {
-      return _mm_cmpeq_ps(v, x.v);
-    }
-    boolvec_t operator!=(realvec const& x) const
-    {
-      return _mm_cmpneq_ps(v, x.v);
-    }
-    boolvec_t operator<(realvec const& x) const
-    {
-      return _mm_cmplt_ps(v, x.v);
-    }
-    boolvec_t operator<=(realvec const& x) const
-    {
-      return _mm_cmple_ps(v, x.v);
-    }
-    boolvec_t operator>(realvec const& x) const
-    {
-      return _mm_cmpgt_ps(v, x.v);
-    }
-    boolvec_t operator>=(realvec const& x) const
-    {
-      return _mm_cmpge_ps(v, x.v);
-    }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-#ifdef __SSE4_1__
-      return _mm_ceil_ps(v);
-#else
-      return MF::vml_ceil(*this);
-#endif
-    }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-#ifdef __SSE4_1__
-      return _mm_floor_ps(v);
-#else
-      return MF::vml_floor(*this);
-#endif
-    }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const { return _mm_max_ps(v, y.v); }
-    realvec fmin(realvec y) const { return _mm_min_ps(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return _mm_cmpunord_ps(v, v);; }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = _mm_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-#ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
-#else
-      return MF::vml_rint(*this);
-#endif
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = _mm_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    boolvec_t signbit() const { return v; }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm_sqrt_ps(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-#ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
-#else
-      return MF::vml_trunc(*this);
-#endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<float,4>::as_int() const -> intvec_t
-  {
-    return _mm_castps_si128(v);
-  }
-  
-  inline
-  auto boolvec<float,4>::convert_int() const -> intvec_t
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  auto boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  auto boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-#ifdef __SSE4_1__
-    return _mm_blendv_ps(y.v, x.v, v);
-#else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-#endif
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<float,4>::as_float() const -> realvec_t
-  {
-    return _mm_castsi128_ps(v);
-  }
-  
-  inline auto intvec<float,4>::convert_float() const -> realvec_t
-  {
-    return _mm_cvtepi32_ps(v);
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FLOAT_SSE2_H
diff --git a/vec_float_sse2_scalar.h b/vec_float_sse2_scalar.h
deleted file mode 100644
index 8dcb5b4..0000000
--- a/vec_float_sse2_scalar.h
+++ /dev/null
@@ -1,523 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FLOAT_SSE2_SCALAR_H
-#define VEC_FLOAT_SSE2_SCALAR_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
-#endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
-#endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
-#endif
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FLOAT_1
-  template<> struct boolvec<float,1>;
-  template<> struct intvec<float,1>;
-  template<> struct realvec<float,1>;
-  
-  
-  
-  template<>
-  struct boolvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    // TODO: remove this
-    boolvec(int x): v(x) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return !v; }
-    
-    boolvec operator&&(boolvec x) const { return v && x.v; }
-    boolvec operator||(boolvec x) const { return v || x.v; }
-    boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); }
-    boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return v; }
-    bool any() const { return v; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return +v; }
-    intvec operator-() const { return -v; }
-    
-    intvec operator+(intvec x) const { return v+x.v; }
-    intvec operator-(intvec x) const { return v-x.v; }
-    intvec operator*(intvec x) const { return v*x.v; }
-    intvec operator/(intvec x) const { return v/x.v; }
-    intvec operator%(intvec x) const { return v%x.v; }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    intvec& operator/=(intvec const& x) { return *this=*this/x; }
-    intvec& operator%=(intvec const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec operator~() const { return ~v; }
-    
-    intvec operator&(intvec x) const { return v&x.v; }
-    intvec operator|(intvec x) const { return v|x.v; }
-    intvec operator^(intvec x) const { return v^x.v; }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const { return U(v) >> U(n); }
-    intvec operator>>(int_t n) const { return v>>n; }
-    intvec operator<<(int_t n) const { return v<<n; }
-    
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const { return U(v) >> U(n); }
-    intvec operator>>(intvec n) const { return v>>n; }
-    intvec operator<<(intvec n) const { return v<<n; }
-    
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t signbit() const
-    {
-      return *this < IV(I(0));
-    }
-    
-    boolvec_t operator==(intvec const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec const& x) const { return v>=x.v; }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*float>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128 from_float(float a) { return _mm_set_ss(a); }
-    static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
-      // return floatprops::convert_int(v);
-      return _mm_cvttss_si32(_mm_set_ss(v));
-    }
-    
-    
-    
-    realvec operator+() const { return +v; }
-    realvec operator-() const { return -v; }
-    
-    realvec operator+(realvec x) const { return v+x.v; }
-    realvec operator-(realvec x) const { return v-x.v; }
-    realvec operator*(realvec x) const { return v*x.v; }
-    realvec operator/(realvec x) const { return v/x.v; }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const { return v; }
-    real_t sum() const { return v; }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-#ifdef __SSE4_1__
-      return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
-#else
-      return std::ceil(v);
-#endif
-    }
-    realvec copysign(realvec y) const { return std::copysign(v, y.v); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return std::fabs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-#ifdef __SSE4_1__
-      return to_float(_mm_floor_ss(from_float(v), from_float(v)));
-#else
-      return std::floor(v);
-#endif
-    }
-    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
-    realvec fmax(realvec y) const
-    {
-      return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
-    }
-    realvec fmin(realvec y) const
-    {
-      return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
-    }
-    realvec fmod(realvec y) const { return std::fmod(v, y.v); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = std::ilogb(v);
-      if (r == FP_ILOGB0) r = numeric_limits<int_t>::min();
-      else if (r == FP_ILOGBNAN) r = numeric_limits<int_t>::max();
-      return r;
-    }
-    boolvec_t isfinite() const { return std::isfinite(v); }
-    boolvec_t isinf() const { return std::isinf(v); }
-    boolvec_t isnan() const
-    {
-      return _mm_ucomineq_ss(from_float(v), from_float(v));
-    }
-    boolvec_t isnormal() const { return std::isnormal(v); }
-    realvec ldexp(int_t n) const { return std::ldexp(v, n); }
-    realvec ldexp(intvec_t n) const { return std::ldexp(v, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return R(1.0)/v; }
-    realvec remainder(realvec y) const { return std::remainder(v, y.v); }
-    realvec rint() const
-    {
-#ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_NEAREST_INT));
-#else
-      return MF::vml_rint(*this);
-#endif
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return std::signbit(v); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    // realvec sqrt1() const { return std::sqrt(v); }
-    realvec sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-#ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_ZERO));
-#else
-      return MF::vml_trunc(*this);
-#endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<float,1>::as_int() const -> intvec_t
-  {
-    return I(v);
-  }
-  
-  inline
-  auto boolvec<float,1>::convert_int() const -> intvec_t
-  {
-    return v;
-  }
-  
-  inline
-  auto boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  auto boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<float,1>::as_float() const -> realvec_t
-  {
-    return FP::as_float(v);
-  }
-  
-  inline auto intvec<float,1>::convert_float() const -> realvec_t
-  {
-    // return FP::convert_float(v);
-    return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FLOAT_SSE2_SCALAR_H
diff --git a/vec_fp16_avx.h b/vec_fp16_avx.h
deleted file mode 100644
index 5dc456c..0000000
--- a/vec_fp16_avx.h
+++ /dev/null
@@ -1,582 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FP16_AVX_H
-#define VEC_FP16_AVX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// AVX intrinsics
-#include <immintrin.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FP16_16
-  template<> struct boolvec<fp16,16>;
-  template<> struct intvec<fp16,16>;
-  template<> struct realvec<fp16,16>;
-  
-  
-  
-  template<>
-  struct boolvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi16(from_bool(as[15]),
-                       from_bool(as[14]),
-                       from_bool(as[13]),
-                       from_bool(as[12]),
-                       from_bool(as[11]),
-                       from_bool(as[10]),
-                       from_bool(as[ 9]),
-                       from_bool(as[ 8]),
-                       from_bool(as[ 7]),
-                       from_bool(as[ 6]),
-                       from_bool(as[ 5]),
-                       from_bool(as[ 4]),
-                       from_bool(as[ 3]),
-                       from_bool(as[ 2]),
-                       from_bool(as[ 1]),
-                       from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = true;
-      for (int n=0; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = false;
-      for (int n=0; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi16(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi16(as[15],
-                       as[14],
-                       as[13],
-                       as[12],
-                       as[11],
-                       as[10],
-                       as[ 9],
-                       as[ 8],
-                       as[ 7],
-                       as[ 6],
-                       as[ 5],
-                       as[ 4],
-                       as[ 3],
-                       as[ 2],
-                       as[ 1],
-                       as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
-                              7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi16(vlo, xvlo);
-      vhi = _mm_add_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi16(vlo, xvlo);
-      vhi = _mm_sub_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi16(vlo, n);
-      vhi = _mm_srli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srai_epi16(vlo, n);
-      vhi = _mm_srai_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi16(vlo, n);
-      vhi = _mm_slli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:16*fp16>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi16(FP::as_int(as[15]),
-                       FP::as_int(as[14]),
-                       FP::as_int(as[13]),
-                       FP::as_int(as[12]),
-                       FP::as_int(as[11]),
-                       FP::as_int(as[10]),
-                       FP::as_int(as[ 9]),
-                       FP::as_int(as[ 8]),
-                       FP::as_int(as[ 7]),
-                       FP::as_int(as[ 6]),
-                       FP::as_int(as[ 5]),
-                       FP::as_int(as[ 4]),
-                       FP::as_int(as[ 3]),
-                       FP::as_int(as[ 2]),
-                       FP::as_int(as[ 1]),
-                       FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<fp16,16>::as_int() const -> intvec_t
-  {
-    return v;
-  }
-  
-  inline
-  auto boolvec<fp16,16>::convert_int() const -> intvec_t
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  auto boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  auto boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<fp16,16>::as_float() const -> realvec_t
-  {
-    return v;
-  }
-  
-  inline auto intvec<fp16,16>::convert_float() const -> realvec_t
-  {
-    __builtin_unreachable();
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FP16_AVX_H
diff --git a/vec_fp8_avx.h b/vec_fp8_avx.h
deleted file mode 100644
index 16087d1..0000000
--- a/vec_fp8_avx.h
+++ /dev/null
@@ -1,648 +0,0 @@
-// -*-C++-*-
-
-#ifndef VEC_FP8_AVX_H
-#define VEC_FP8_AVX_H
-
-#include "floatprops.h"
-#include "mathfuncs.h"
-#include "vec_base.h"
-
-#include <cmath>
-
-// AVX intrinsics
-#include <immintrin.h>
-
-
-
-namespace vecmathlib {
-  
-#define VECMATHLIB_HAVE_VEC_FP8_32
-  template<> struct boolvec<fp8,32>;
-  template<> struct intvec<fp8,32>;
-  template<> struct realvec<fp8,32>;
-  
-  
-  
-  template<>
-  struct boolvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi8(from_bool(as[31]),
-                      from_bool(as[30]),
-                      from_bool(as[29]),
-                      from_bool(as[28]),
-                      from_bool(as[27]),
-                      from_bool(as[26]),
-                      from_bool(as[25]),
-                      from_bool(as[24]),
-                      from_bool(as[23]),
-                      from_bool(as[22]),
-                      from_bool(as[21]),
-                      from_bool(as[20]),
-                      from_bool(as[19]),
-                      from_bool(as[18]),
-                      from_bool(as[17]),
-                      from_bool(as[16]),
-                      from_bool(as[15]),
-                      from_bool(as[14]),
-                      from_bool(as[13]),
-                      from_bool(as[12]),
-                      from_bool(as[11]),
-                      from_bool(as[10]),
-                      from_bool(as[ 9]),
-                      from_bool(as[ 8]),
-                      from_bool(as[ 7]),
-                      from_bool(as[ 6]),
-                      from_bool(as[ 5]),
-                      from_bool(as[ 4]),
-                      from_bool(as[ 3]),
-                      from_bool(as[ 2]),
-                      from_bool(as[ 1]),
-                      from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
-    boolvec& set_elt(int n, bool a)
-    {
-      return ((uint_t*)&v)[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = true;
-      for (int n=0; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = false;
-      for (int n=0; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi8(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi8(as[31],
-                      as[30],
-                      as[29],
-                      as[28],
-                      as[27],
-                      as[26],
-                      as[25],
-                      as[24],
-                      as[23],
-                      as[22],
-                      as[21],
-                      as[20],
-                      as[19],
-                      as[18],
-                      as[17],
-                      as[16],
-                      as[15],
-                      as[14],
-                      as[13],
-                      as[12],
-                      as[11],
-                      as[10],
-                      as[ 9],
-                      as[ 8],
-                      as[ 7],
-                      as[ 6],
-                      as[ 5],
-                      as[ 4],
-                      as[ 3],
-                      as[ 2],
-                      as[ 1],
-                      as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
-                             23, 22, 21, 20, 19, 18, 17, 16,
-                             15, 14, 13, 12, 11, 10, 9, 8,
-                             7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
-    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare with zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi8(vlo, xvlo);
-      vhi = _mm_add_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator-(intvec x) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi8(vlo, xvlo);
-      vhi = _mm_sub_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator|(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    intvec operator^(intvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU) >> U(n);
-      uint_t maskhi = U(0xff00U);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator>>(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U);
-      __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
-                                    _mm_set1_epi16(maskhi));
-      vlo = _mm_or_si128(vlolo, vlohi);
-      __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
-                                    _mm_set1_epi16(maskhi));
-      vhi = _mm_or_si128(vhilo, vhihi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec operator<<(int_t n) const
-    {
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U) << U(n);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<AVX:32*fp8>"; }
-    void barrier() { __asm__("": "+x" (v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi8(FP::as_int(as[31]),
-                      FP::as_int(as[30]),
-                      FP::as_int(as[29]),
-                      FP::as_int(as[28]),
-                      FP::as_int(as[27]),
-                      FP::as_int(as[26]),
-                      FP::as_int(as[25]),
-                      FP::as_int(as[24]),
-                      FP::as_int(as[23]),
-                      FP::as_int(as[22]),
-                      FP::as_int(as[21]),
-                      FP::as_int(as[20]),
-                      FP::as_int(as[19]),
-                      FP::as_int(as[18]),
-                      FP::as_int(as[17]),
-                      FP::as_int(as[16]),
-                      FP::as_int(as[15]),
-                      FP::as_int(as[14]),
-                      FP::as_int(as[13]),
-                      FP::as_int(as[12]),
-                      FP::as_int(as[11]),
-                      FP::as_int(as[10]),
-                      FP::as_int(as[ 9]),
-                      FP::as_int(as[ 8]),
-                      FP::as_int(as[ 7]),
-                      FP::as_int(as[ 6]),
-                      FP::as_int(as[ 5]),
-                      FP::as_int(as[ 4]),
-                      FP::as_int(as[ 3]),
-                      FP::as_int(as[ 2]),
-                      FP::as_int(as[ 1]),
-                      FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
-    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline
-  auto boolvec<fp8,32>::as_int() const -> intvec_t
-  {
-    return v;
-  }
-  
-  inline
-  auto boolvec<fp8,32>::convert_int() const -> intvec_t
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  auto boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  auto boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
-  {
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline auto intvec<fp8,32>::as_float() const -> realvec_t
-  {
-    return v;
-  }
-  
-  inline auto intvec<fp8,32>::convert_float() const -> realvec_t
-  {
-    __builtin_unreachable();
-  }
-  
-} // namespace vecmathlib
-
-#endif  // #ifndef VEC_FP8_AVX_H
diff --git a/vec_neon_float2.h b/vec_neon_float2.h
new file mode 100644
index 0000000..258a091
--- /dev/null
+++ b/vec_neon_float2.h
@@ -0,0 +1,558 @@
+// -*-C++-*-
+
+// <http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html>
+
+#ifndef VEC_NEON_FLOAT2_H
+#define VEC_NEON_FLOAT2_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// Neon intrinsics
+#include <arm_neon.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FLOAT_2
+  template<> struct boolvec<float,2>;
+  template<> struct intvec<float,2>;
+  template<> struct realvec<float,2>;
+  
+  
+  
+  template<>
+  struct boolvec<float,2>: floatprops<float>
+  {
+    static int const size = 2;
+    typedef bool scalar_t;
+    typedef uint32x2_t bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values are -1, false values are 0
+    static uint_t from_bool(bool a) { return -int_t(a); }
+    static bool to_bool(uint_t a) { return a; }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(vdup_n_u32(from_bool(a))) {}
+    boolvec(bool const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return vmvn_u32(v); }
+    
+    boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
+    boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
+    boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
+    boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
+    
+    bool all() const  // true iff both lanes are true
+    {
+      boolvec r = vpmin_u32(v, v);  // pairwise min: lane 0 = min(v[0], v[1])
+      return to_bool(r[0]);  // lanes are 0 or all-ones, so the min is non-zero only if both are set
+    }
+    bool any() const  // true iff at least one lane is true
+    {
+      boolvec r = vpmax_u32(v, v);  // pairwise max: lane 0 = max(v[0], v[1])
+      return to_bool(r[0]);  // lanes are 0 or all-ones, so the max is non-zero if either is set
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,2>: floatprops<float>
+  {
+    static int const size = 2;
+    typedef int_t scalar_t;
+    typedef int32x2_t ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(vdup_n_s32(a)) {}
+    intvec(int_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    static intvec iota()  // lane n holds the value n, i.e. {0, 1}
+    { // vcreate_s32 (little-endian lane order): low 32 bits -> lane 0, high 32 bits -> lane 1
+      return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    // Vector casts do not change the bit pattern
+    boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
+    boolvec_t convert_bool() const { return *this != IV(0); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return vneg_s32(v); }
+    
+    intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
+    intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
+    intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    intvec& operator*=(intvec const& x) { return *this=*this*x; }
+    
+    
+    
+    intvec operator~() const { return vmvn_s32(v); }
+    
+    intvec operator&(intvec x) const { return vand_s32(v, x.v); }
+    intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
+    intvec operator^(intvec x) const { return veor_s32(v, x.v); }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return lsr(IV(n)); }
+    intvec operator>>(int_t n) const { return *this >> IV(n); }
+    intvec operator<<(int_t n) const { return *this << IV(n); }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const  // per-lane logical (zero-filling) shift right by n
+    { // vshl with a negated count shifts right; using unsigned lanes makes the shift zero-filling
+      return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
+    }
+    intvec operator>>(intvec n) const  // per-lane arithmetic (sign-propagating) shift right by n
+    { // vshl on signed lanes with a negated count shifts right
+      return vshl_s32(v, (-n).v);
+    }
+    intvec operator<<(intvec n) const
+    {
+      return vshl_s32(v, n.v);
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      //return *this < IV(I(0));
+      return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); }
+    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
+    boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); }
+    boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); }
+    boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); }
+    boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,2>: floatprops<float>
+  {
+    static int const size = 2;
+    typedef real_t scalar_t;
+    typedef float32x2_t vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<NEON:2*float>"; }
+    void barrier() { __asm__("": "+w" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(vdup_n_f32(a)) {}
+    realvec(real_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return vld1_f32(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+#if defined __ARM_FEATURE_UNALIGNED
+      return vld1_f32(p);
+#else
+#  error "unaligned NEON loads not implemented"
+#endif
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const  // masked aligned load from p
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {  // same flag the other masked loads/stores test; presumably "every lane selected"
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);  // lanes not selected by m.m keep their value from *this
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      vst1_f32(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      // Vector stores would require vector loads, which would need to
+      // be atomic
+      // p[0] = (*this)[0];
+      // p[1] = (*this)[1];
+#if defined __ARM_FEATURE_UNALIGNED
+      vst1_f32(p, v);
+#else
+#  error "unaligned NEON stores not implemented"
+#endif
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Full mask (expected case): one aligned vector store
+      if (__builtin_expect(m.all_m, true)) return storea(p);
+      // Partial mask: write only the selected lanes, one scalar at a
+      // time, so memory of unselected lanes is not written
+      for (int lane=0; lane<size; ++lane)
+        if (m.m[lane]) p[lane] = (*this)[lane];
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return vreinterpret_s32_f32(v); }
+    intvec_t convert_int() const { return vcvt_s32_f32(v); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return vneg_f32(v); }
+    
+    realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
+    realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
+    realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
+    realvec operator/(realvec x) const { return *this * x.rcp(); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1];
+    }
+    real_t sum() const
+    {
+      realvec r = vpadd_f32(v, v); // pairwise add; presumably lane 0 = v[0]+v[1]
+      return r[0];
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); }
+    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
+    boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); }
+    boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); }
+    boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); }
+    boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const
+    {
+      // return vrndp_f32(v);
+      return MF::vml_ceil(*this);
+    }
+    realvec copysign(realvec y) const
+    {
+      return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
+    }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return vabs_f32(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const
+    {
+      // return vrndm_f32(v);
+      return MF::vml_floor(*this);
+    }
+    realvec fma(realvec y, realvec z) const
+    {
+      // TODO: vfma_f32
+      return vmla_f32(z.v, v, y.v);
+    }
+    realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
+    realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const
+    {
+      realvec r = vrecpe_f32(v);  // initial estimate of 1/v
+      r *= vrecps_f32(v, r);      // refinement step (presumably Newton-Raphson)
+      r *= vrecps_f32(v, r);      // second refinement step for more accuracy
+      return r;
+    }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const
+    {
+      // return vrndn_f32(v);
+      return MF::vml_rint(*this);
+    }
+    realvec round() const
+    {
+      // return vrnda_f32(v);
+      return MF::vml_round(*this);
+    }
+    realvec rsqrt() const
+    {
+      realvec r = vrsqrte_f32(v);  // initial estimate of 1/sqrt(v)
+      r *= vrsqrts_f32(v, r*r);    // refinement step (presumably Newton-Raphson)
+      r *= vrsqrts_f32(v, r*r);    // second refinement step for more accuracy
+      return r;
+    }
+    boolvec_t signbit() const { return MF::vml_signbit(*this); }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const { return *this * rsqrt(); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const
+    {
+      // return vrnd_f32(v);
+      return MF::vml_trunc(*this);
+    }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<float,2>::as_int() const -> intvec_t
+  {
+    return vreinterpret_s32_u32(v);
+  }
+  
+  inline
+  auto boolvec<float,2>::convert_int() const -> intvec_t
+  {
+    return - as_int(); // presumably true lanes are -1 as int, so this gives 1/0 -- confirm
+  }
+  
+  inline
+  auto boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return vbsl_s32(v, x.v, y.v);
+  }
+  
+  inline
+  auto boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return vbsl_f32(v, x.v, y.v);
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,2>::as_float() const -> realvec_t
+  {
+    return vreinterpret_f32_s32(v);
+  }
+  
+  inline auto intvec<float,2>::convert_float() const -> realvec_t
+  {
+    return vcvt_f32_s32(v);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_NEON_FLOAT2_H
diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h
new file mode 100644
index 0000000..e7a1c05
--- /dev/null
+++ b/vec_qpx_double4.h
@@ -0,0 +1,667 @@
+// -*-C++-*-
+
+#ifndef VEC_QPX_DOUBLE4_H
+#define VEC_QPX_DOUBLE4_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+#warning "TODO"
+#include <iostream>
+
+// QPX intrinsics
+#ifdef __clang__
+#  include <qpxintrin.h>
+#else
+#  include <builtins.h>
+#endif
+#include <mass_simd.h>
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_DOUBLE_4
+  template<> struct boolvec<double,4>;
+  template<> struct intvec<double,4>;
+  template<> struct realvec<double,4>;
+  
+  
+  
+  template<>
+  struct boolvec<double,4>: floatprops<double>
+  {
+    static int const size = 4;
+    typedef bool scalar_t;
+    typedef vector4double bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // canonical true is +1.0, canonical false is -1.0
+    // >=0 is true, -0 is true, nan is false
+    static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
+    static bool to_bool(real_t a) { return a>=0.0; }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(vec_splats(from_bool(a))) {}
+    boolvec(bool const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const
+    {
+      // return to_bool(((real_t const*)&v)[n]);
+      return to_bool(v[n]);
+    }
+    boolvec& set_elt(int n, bool a)
+    {
+      // return ((real_t*)&v)[n]=from_bool(a), *this;
+      return v[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return vec_not(v); }
+    
+    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+    boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); }
+    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+    
+    bool all() const // true only if every one of the four lanes is true
+    {
+      return !(!(*this)[0] || !(*this)[1] || !(*this)[2] || !(*this)[3]);
+    }
+    bool any() const // true if at least one of the four lanes is true
+    {
+      return !(!(*this)[0] && !(*this)[1] && !(*this)[2] && !(*this)[3]);
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,4>: floatprops<double>
+  {
+    static int const size = 4;
+    typedef int_t scalar_t;
+    typedef vector4double ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(vec_splats(FP::as_float(a))) {}
+    intvec(int_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    static intvec iota() // returns the lane indices {0, 1, 2, 3}
+    {
+      const int_t iota_[] = {0, 1, 2, 3}; // lane d must hold the value d
+      return intvec(iota_);
+    }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const
+    {
+      // return ((int_t const*)&v)[n];
+      return FP::as_int(v[n]);
+    }
+    intvec& set_elt(int n, int_t a)
+    {
+      // return ((int_t*)&v)[n]=a, *this;
+      return v[n]=FP::as_float(a), *this;
+    }
+    
+    
+    
+    // Vector casts do not change the bit pattern
+    boolvec_t as_bool() const { return v; }
+    boolvec_t convert_bool() const { return *this != IV(I(0)); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]);
+      return r;
+    }
+    
+    intvec operator+(intvec x) const
+    {
+      // Lane-wise scalar add; x is a by-value copy and is updated in place
+      for (int d=0; d<size; ++d) x.set_elt(d, (*this)[d] + x[d]);
+      return x;
+    }
+    intvec operator-(intvec x) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]);
+      return r;
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]);
+      return r;
+    }
+    
+    intvec operator&(intvec x) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]);
+      return r;
+    }
+    intvec operator|(intvec x) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]);
+      return r;
+    }
+    intvec operator^(intvec x) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]);
+      return r;
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n));
+      return r;
+    }
+    intvec operator>>(int_t n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n);
+      return r;
+    }
+    intvec operator<<(int_t n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n);
+      return r;
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d]));
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]);
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]);
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return *this < IV(I(0));
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]);
+      return r;
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]);
+      return r;
+    }
+    boolvec_t operator<(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]);
+      return r;
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]);
+      return r;
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]);
+      return r;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      boolvec_t r;
+      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]);
+      return r;
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,4>: floatprops<double>
+  {
+    static int const size = 4;
+    typedef real_t scalar_t;
+    typedef vector4double vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<QPX:4*double>"; }
+    void barrier() { __asm__("": "+v" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(vec_splats(a)) {}
+    realvec(real_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const
+    {
+      // return ((real_t const*)&v)[n];
+      return v[n];
+    }
+    realvec& set_elt(int n, real_t a)
+    {
+      // return ((real_t*)&v)[n]=a, *this;
+      return v[n]=a, *this;
+    }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return vec_lda(0, (real_t*)p);
+    }
+    static realvec_t loadu(real_t const* p) // loads 4 doubles; p is not asserted to be aligned
+    {
+      realvec_t v0 = vec_ld(0, (real_t*)p);  // presumably the aligned vector holding p[0]
+      realvec_t v1 = vec_ld(31, (real_t*)p); // presumably the aligned vector holding byte p+31
+      return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p)); // permute the two into one vector
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      // TODO: use load instruction with fixed offset
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0); // p must be vector-aligned
+      // Test the mask's stored all_m flag, as the other masked loads and
+      // stores do, instead of re-reducing m.m with all()
+      if (__builtin_expect(m.all_m, true)) return loada(p);
+      // Partial mask: ifthen picks loaded lanes or this vector's lanes
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      // TODO: use load instruction with fixed offset
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Aligned store of all four lanes to p. No tracing here: this runs
+      // for every vector written and must not print to stdout.
+      vec_sta(v, 0, p);
+    }
+    void storeu(real_t* p) const
+    {
+      // Unaligned store, written lane by lane with scalar assignments.
+      // Vector stores would require vector loads, which would need to
+      // be atomic
+      // TODO: see
+      // <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+      // for good ideas
+      for (int lane=0; lane<size; ++lane) p[lane] = (*this)[lane];
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      // Full mask (expected case): a single aligned vector store
+      if (__builtin_expect(m.all_m, true)) return storea(p);
+      // Partial mask: write the selected lanes one scalar at a time;
+      // memory belonging to unselected lanes is not written
+      for (int lane=0; lane<size; ++lane) {
+        if (!m.m[lane]) continue;
+        p[lane] = (*this)[lane];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return v; }
+    intvec_t convert_int() const { return vec_ctid(v); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return vec_neg(v); }
+    
+    realvec operator+(realvec x) const { return vec_add(v, x.v); }
+    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+    realvec operator/(realvec x) const
+    {
+      // return vec_swdiv_nochk(v, x.v);
+      return div_fastd4(v, x.v);
+    }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    }
+    real_t sum() const
+    {
+      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
+    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
+    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
+    boolvec_t operator<=(realvec const& x) const { return (*this < x) || (*this == x); } // not !(>): that is true for NaN
+    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
+    boolvec_t operator>=(realvec const& x) const { return (*this > x) || (*this == x); } // not !(<): that is true for NaN
+    
+    
+    
+    realvec acos() const { return acosd4(v); }
+    realvec acosh() const { return acoshd4(v); }
+    realvec asin() const { return asind4(v); }
+    realvec asinh() const { return asinhd4(v); }
+    realvec atan() const { return atand4(v); }
+    realvec atan2(realvec y) const { return atan2d4(v, y.v); }
+    realvec atanh() const { return atanhd4(v); }
+    realvec cbrt() const { return cbrtd4(v); }
+    realvec ceil() const { return vec_ceil(v); }
+    realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); } // magnitude of *this, sign of y; vec_cpsgn takes the sign source first
+    realvec cos() const { return cosd4(v); }
+    realvec cosh() const { return coshd4(v); }
+    realvec exp() const { return expd4(v); }
+    realvec exp10() const { return exp10d4(v); }
+    realvec exp2() const { return exp2d4(v); }
+    realvec expm1() const { return expm1d4(v); }
+    realvec fabs() const { return vec_abs(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const { return vec_floor(v); }
+    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+    realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
+    realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return hypotd4(v, y.v); }
+    intvec_t ilogb() const // scalar fallback: one ::ilogb call per lane
+    {
+      int_t ilogb_[] = {
+        ::ilogb((*this)[0]),
+        ::ilogb((*this)[1]),
+        ::ilogb((*this)[2]),
+        ::ilogb((*this)[3])
+      };
+      return intvec_t(ilogb_);
+    }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return vec_tstnan(v, v); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
+    realvec ldexp(intvec_t n) const // scalar fallback: one std::ldexp call per lane
+    {
+      real_t ldexp_[] = {
+        std::ldexp((*this)[0], n[0]),
+        std::ldexp((*this)[1], n[1]),
+        std::ldexp((*this)[2], n[2]),
+        std::ldexp((*this)[3], n[3])
+      };
+      return realvec_t(ldexp_);
+    }
+    realvec log() const { return logd4(v); }
+    realvec log10() const { return log10d4(v); }
+    realvec log1p() const { return log1pd4(v); }
+    realvec log2() const { return log2d4(v); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return powd4(v, y.v); }
+    realvec rcp() const { return recip_fastd4(v); }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const { return MF::vml_rint(*this); }
+    realvec round() const { return vec_round(v); }
+    realvec rsqrt() const
+    {
+      realvec x = *this;
+      realvec r = vec_rsqrte(x.v); // this is only an approximation
+      // TODO: use fma
+      // one Newton iteration (see vml_rsqrt)
+      r += RV(0.5)*r * (RV(1.0) - x * r*r);
+      return r;
+    }
+    boolvec_t signbit() const { return !RV(1.0).copysign(*this).as_int().as_bool(); } // +-1.0 carrying our sign; boolvec treats >=0 as true
+    realvec sin() const { return sind4(v); }
+    realvec sinh() const { return sinhd4(v); }
+    realvec sqrt() const
+    {
+      // Alternative: return vec_sqrtsw_nochk(v);  NOTE(review): for a zero lane rsqrt() is presumably inf, making 0*inf = NaN -- confirm
+      return *this * rsqrt();
+    }
+    realvec tan() const { return tand4(v); }
+    realvec tanh() const { return tanhd4(v); }
+    realvec trunc() const { return vec_trunc(v); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  boolvec<double,4>::intvec_t boolvec<double,4>::as_int() const
+  {
+    return v;
+  }
+  
+  inline
+  boolvec<double,4>::intvec_t boolvec<double,4>::convert_int() const
+  {
+    return ifthen(IV(I(1)), IV(I(0)));
+  }
+  
+  inline
+  boolvec<double,4>::intvec_t boolvec<double,4>::ifthen(intvec_t x,
+                                                        intvec_t y) const
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  boolvec<double,4>::realvec_t boolvec<double,4>::ifthen(realvec_t x,
+                                                         realvec_t y) const
+  {
+    return vec_sel(y.v, x.v, v);
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline intvec<double,4>::realvec_t intvec<double,4>::as_float() const
+  {
+    return v;
+  }
+  
+  inline intvec<double,4>::realvec_t intvec<double,4>::convert_float() const
+  {
+    return vec_cfid(v);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_QPX_DOUBLE4_H
diff --git a/vec_sse_double1.h b/vec_sse_double1.h
new file mode 100644
index 0000000..ff1145e
--- /dev/null
+++ b/vec_sse_double1.h
@@ -0,0 +1,528 @@
+// -*-C++-*-
+
+#ifndef VEC_SSE_DOUBLE1_H
+#define VEC_SSE_DOUBLE1_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// SSE2 intrinsics
+#include <emmintrin.h>
+#ifdef __SSE3__                 // Intel's SSE 3
+#  include <pmmintrin.h>
+#endif
+#ifdef __SSE4_1__               // Intel's SSE 4.1
+#  include <smmintrin.h>
+#endif
+#ifdef __SSE4A__                // AMD's SSE 4a
+#  include <ammintrin.h>
+#endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_DOUBLE_1
+  template<> struct boolvec<double,1>;
+  template<> struct intvec<double,1>;
+  template<> struct realvec<double,1>;
+  
+  
+  
+  template<>
+  struct boolvec<double,1>: floatprops<double>
+  {
+    static int const size = 1;
+    typedef bool scalar_t;
+    typedef uint_t bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+    // true values are non-zero, false values are zero
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(a) {}
+    // TODO: remove this
+    boolvec(int x): v(x) {}
+    boolvec(bool const* as): v(as[0]) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return v; }
+    boolvec& set_elt(int n, bool a) { return v=a, *this; }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return !v; }
+    
+    boolvec operator&&(boolvec x) const { return v && x.v; }
+    boolvec operator||(boolvec x) const { return v || x.v; }
+    boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); }
+    boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); }
+    
+    bool all() const { return v; }
+    bool any() const { return v; }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,1>: floatprops<double>
+  {
+    static int const size = 1;
+    typedef int_t scalar_t;
+    typedef int_t ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(int_t a): v(a) {}
+    intvec(int_t const* as): v(as[0]) {}
+    static intvec iota() { return intvec(I(0)); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return v; }
+    intvec& set_elt(int n, int_t a) { return v=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return U(v); }
+    boolvec_t convert_bool() const { return bool(v); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    intvec operator+() const { return +v; }
+    intvec operator-() const { return -v; }
+    
+    intvec operator+(intvec x) const { return v+x.v; }
+    intvec operator-(intvec x) const { return v-x.v; }
+    intvec operator*(intvec x) const { return v*x.v; }
+    intvec operator/(intvec x) const { return v/x.v; }
+    intvec operator%(intvec x) const { return v%x.v; }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    intvec& operator*=(intvec const& x) { return *this=*this*x; }
+    intvec& operator/=(intvec const& x) { return *this=*this/x; }
+    intvec& operator%=(intvec const& x) { return *this=*this%x; }
+    
+    
+    
+    intvec operator~() const { return ~v; }
+    
+    intvec operator&(intvec x) const { return v&x.v; }
+    intvec operator|(intvec x) const { return v|x.v; }
+    intvec operator^(intvec x) const { return v^x.v; }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return U(v) >> U(n); }
+    intvec operator>>(int_t n) const { return v>>n; }
+    intvec operator<<(int_t n) const { return v<<n; }
+    
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const { return U(v) >> U(n); }
+    intvec operator>>(intvec n) const { return v>>n; }
+    intvec operator<<(intvec n) const { return v<<n; }
+    
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const { return *this < IV(I(0)); }
+    
+    boolvec_t operator==(intvec const& x) const { return v==x.v; }
+    boolvec_t operator!=(intvec const& x) const { return v!=x.v; }
+    boolvec_t operator<(intvec const& x) const { return v<x.v; }
+    boolvec_t operator<=(intvec const& x) const { return v<=x.v; }
+    boolvec_t operator>(intvec const& x) const { return v>x.v; }
+    boolvec_t operator>=(intvec const& x) const { return v>=x.v; }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,1>: floatprops<double>
+  {
+    static int const size = 1;
+    typedef real_t scalar_t;
+    typedef double vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<SSE2:1*double>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+  private:
+    static __m128d from_double(double a) { return _mm_set_sd(a); }
+    static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
+  public:
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(real_t a): v(a) {}
+    realvec(real_t const* as): v(as[0]) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return v; }
+    realvec& set_elt(int n, real_t a) { return v=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return *p;
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return *p;
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return loada(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(all(m.m), true)) {
+        return loada(p);
+      } else {
+        return *this;
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return *this;
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return loada(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      *p = v;
+    }
+    void storeu(real_t* p) const
+    {
+      *p = v;
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      storea(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      storea(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return floatprops::as_int(v); }
+    intvec_t convert_int() const {
+#ifdef __x86_64__
+      return _mm_cvttsd_si64(_mm_set_sd(v));
+#else
+      return floatprops::convert_int(v);
+#endif
+    }
+    
+    
+    
+    realvec operator+() const { return +v; }
+    realvec operator-() const { return -v; }
+    
+    realvec operator+(realvec x) const { return v+x.v; }
+    realvec operator-(realvec x) const { return v-x.v; }
+    realvec operator*(realvec x) const { return v*x.v; }
+    realvec operator/(realvec x) const { return v/x.v; }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { return v; }
+    real_t sum() const { return v; }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return v==x.v; }
+    boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
+    boolvec_t operator<(realvec const& x) const { return v<x.v; }
+    boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
+    boolvec_t operator>(realvec const& x) const { return v>x.v; }
+    boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const
+    {
+#ifdef __SSE4_1__
+      return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
+#else
+      return std::ceil(v);
+#endif
+    }
+    realvec copysign(realvec y) const { return std::copysign(v, y.v); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return std::fabs(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const
+    {
+#ifdef __SSE4_1__
+      return to_double(_mm_floor_sd(from_double(v), from_double(v)));
+#else
+      return std::floor(v);
+#endif
+    }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const
+    {
+      return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+    }
+    realvec fmin(realvec y) const
+    {
+      return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+    }
+    realvec fmod(realvec y) const { return std::fmod(v, y.v); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const // std::ilogb with its two special return codes remapped to the int_t limits
+    {
+      int_t r = std::ilogb(v);
+      if (r == FP_ILOGB0) r = numeric_limits<int_t>::min(); // code reported for a zero input -> smallest int_t
+      else if (r == FP_ILOGBNAN) r = numeric_limits<int_t>::max(); // code reported for a NaN input -> largest int_t
+      return r;
+    }
+    boolvec_t isfinite() const { return std::isfinite(v); }
+    boolvec_t isinf() const { return std::isinf(v); }
+    boolvec_t isnan() const
+    { // unordered self-compare is true only for NaN (same predicate as the 2-wide _mm_cmpunord_pd); avoids depending on how ucomineq treats the unordered case
+      return _mm_movemask_pd(_mm_cmpunord_sd(from_double(v), from_double(v))) & 1; // bit 0 = low lane; high lane is zeroed by from_double
+    }
+    boolvec_t isnormal() const { return std::isnormal(v); }
+    realvec ldexp(int_t n) const { return std::ldexp(v, n); }
+    realvec ldexp(intvec_t n) const { return std::ldexp(v, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const { return R(1.0)/v; }
+    realvec remainder(realvec y) const { return std::remainder(v, y.v); }
+    realvec rint() const
+    {
+#ifdef __SSE4_1__
+      return to_double(_mm_round_sd(from_double(v), from_double(v),
+                                    _MM_FROUND_TO_NEAREST_INT));
+#else
+      return MF::vml_rint(*this);
+#endif
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    boolvec_t signbit() const { return std::signbit(v); }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const
+    {
+      return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
+    }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const
+    {
+#ifdef __SSE4_1__
+      return to_double(_mm_round_sd(from_double(v), from_double(v),
+                                    _MM_FROUND_TO_ZERO));
+#else
+      return MF::vml_trunc(*this);
+#endif
+    }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<double,1>::as_int() const -> intvec_t
+  {
+    return I(v);
+  }
+  
+  inline // value conversion: false -> 0, true -> 1 (as_int() is the raw bit view instead)
+  auto boolvec<double,1>::convert_int() const -> intvec_t
+  {
+    return I(bool(v)); // "true" is any non-zero v, so normalize to exactly 1 rather than returning the raw bits
+  }
+  
+  inline
+  auto boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return v ? x : y;
+  }
+  
+  inline
+  auto boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return v ? x : y;
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<double,1>::as_float() const -> realvec_t
+  {
+    return FP::as_float(v);
+  }
+  
+  inline auto intvec<double,1>::convert_float() const -> realvec_t
+  {
+#ifdef __x86_64__
+    return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
+#else
+    return FP::convert_float(v);
+#endif
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_SSE_DOUBLE1_H
diff --git a/vec_sse_double2.h b/vec_sse_double2.h
new file mode 100644
index 0000000..2cbbd38
--- /dev/null
+++ b/vec_sse_double2.h
@@ -0,0 +1,646 @@
+// -*-C++-*-
+
+#ifndef VEC_SSE_DOUBLE2_H
+#define VEC_SSE_DOUBLE2_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// SSE2 intrinsics
+#include <emmintrin.h>
+#ifdef __SSE3__                 // Intel's SSE 3
+#  include <pmmintrin.h>
+#endif
+#ifdef __SSE4_1__               // Intel's SSE 4.1
+#  include <smmintrin.h>
+#endif
+#ifdef __SSE4A__                // AMD's SSE 4a
+#  include <ammintrin.h>
+#endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_DOUBLE_2
+  template<> struct boolvec<double,2>;
+  template<> struct intvec<double,2>;
+  template<> struct realvec<double,2>;
+  
+  
+  
+  template<>
+  struct boolvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef bool scalar_t;
+    typedef __m128d bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a):
+    v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
+    boolvec(bool const* as):
+    v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return _mm_xor_pd(boolvec(true), v); }
+    
+    boolvec operator&&(boolvec x) const { return _mm_and_pd(v, x.v); }
+    boolvec operator||(boolvec x) const { return _mm_or_pd(v, x.v); }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const { return _mm_xor_pd(v, x.v); }
+    
+    bool all() const
+    {
+      // Scalar equivalent: (*this)[0] && (*this)[1]
+#if defined __AVX__
+      boolvec const inverted = ! *this; // every lane true <=> no lane of the negation is true
+      return ! inverted.any();
+#else
+      boolvec const swapped = _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0,1)); // exchange the two lanes
+      return (*this && swapped)[0];     // lane 0 now holds the AND of both lanes
+#endif
+    }
+    bool any() const
+    {
+      // Scalar equivalent: (*this)[0] || (*this)[1]
+#if defined __AVX__
+      return _mm_testz_pd(v, v) == 0;   // testz is non-zero only when no sign bit is set
+#else
+      boolvec const swapped = _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0,1)); // exchange the two lanes
+      boolvec const either = *this || swapped;
+      return either[0];                 // lane 0 now holds the OR of both lanes
+#endif
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef int_t scalar_t;
+    typedef __m128i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm_set1_epi64x(a)) {}
+    intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {}
+    static intvec iota() { return _mm_set_epi64x(1, 0); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we OR two
+      // sign bits: that of x, and that of x + (signbit_mask - 1).
+      intvec x = *this;
+      // boolvec values depend only on the sign bit; presumably signbit_mask is the top bit only, so the sum reaches the sign bit for every x > 0
+      // return (~(x-1) | x).as_bool();
+      // return x.as_bool() || !(x-1).as_bool();
+      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const { return _mm_add_epi64(v, x.v); }
+    intvec operator-(intvec x) const { return _mm_sub_epi64(v, x.v); }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v),
+                                         _mm_castsi128_pd(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v),
+                                        _mm_castsi128_pd(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v),
+                                         _mm_castsi128_pd(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return _mm_srli_epi64(v, n); }
+    intvec operator>>(int_t n) const
+    {
+      // There is no _mm_srai_epi64. To emulate it, add the sign-bit
+      // value 1<<(bits-1) before shifting, and subtract that same
+      // value shifted right by n after shifting
+      intvec x = *this;
+      // Convert signed to unsigned
+      x += U(1) << (bits-1);
+      // Shift
+      x = x.lsr(n);
+      // Undo conversion
+      x -= U(1) << (bits-1-n); // NOTE(review): the count bits-1-n goes negative for n >= bits; presumably callers keep 0 <= n < bits
+      return x;
+    }
+    intvec operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+    boolvec_t operator<(intvec const& x) const
+    {
+      // return (*this - x).as_bool();
+      boolvec_t r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] < x[i]);
+      }
+      return r;
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef real_t scalar_t;
+    typedef __m128d vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<SSE2:2*double>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm_set1_pd(a)) {}
+    realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return _mm_load_pd(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm_loadu_pd(p);
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const // masked aligned load
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(all(m.m), true)) { // NOTE(review): the sibling masked loads/stores test m.all_m here; presumably equivalent
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this); // full vector is still read from p; ifthen picks the loaded value as then-value, *this as else-value
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      _mm_store_pd(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm_storeu_pd(p, v);
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const // masked aligned store
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+#if defined __AVX__
+        _mm_maskstore_pd(p, m.m.as_int(), v);
+#else
+        if      (m.m[0]) _mm_storel_pd(p  , v); // write only the low element to p[0]
+        else if (m.m[1]) _mm_storeh_pd(p+1, v); // write only the high element to p[1]; the "else" presumably relies on all_m being false, i.e. not both lanes set
+#endif
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if      (m.m[0]) _mm_storel_pd(p  , v);
+        else if (m.m[1]) _mm_storeh_pd(p+1, v);
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return _mm_castpd_si128(v); }
+    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return _mm_xor_pd(v, _mm_set1_pd(-0.0)); } // negate by flipping the sign bit of each lane; 0.0 - x would turn +0.0 into +0.0 instead of -0.0
+    
+    realvec operator+(realvec x) const { return _mm_add_pd(v, x.v); }
+    realvec operator-(realvec x) const { return _mm_sub_pd(v, x.v); }
+    realvec operator*(realvec x) const { return _mm_mul_pd(v, x.v); }
+    realvec operator/(realvec x) const { return _mm_div_pd(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1];
+    }
+    real_t sum() const
+    {
+#ifdef __SSE3__
+      return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
+#else
+      return (*this)[0] + (*this)[1];
+#endif
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const
+    {
+      return _mm_cmpeq_pd(v, x.v);
+    }
+    boolvec_t operator!=(realvec const& x) const
+    {
+      return _mm_cmpneq_pd(v, x.v);
+    }
+    boolvec_t operator<(realvec const& x) const
+    {
+      return _mm_cmplt_pd(v, x.v);
+    }
+    boolvec_t operator<=(realvec const& x) const
+    {
+      return _mm_cmple_pd(v, x.v);
+    }
+    boolvec_t operator>(realvec const& x) const
+    {
+      return _mm_cmpgt_pd(v, x.v);
+    }
+    boolvec_t operator>=(realvec const& x) const
+    {
+      return _mm_cmpge_pd(v, x.v);
+    }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    // Per-lane ceiling: SSE4.1 intrinsic when available, generic mathfuncs code otherwise.
+    realvec ceil() const {
+#if !defined __SSE4_1__
+      return MF::vml_ceil(*this);
+#else
+      return _mm_ceil_pd(v);
+#endif
+    }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    // Per-lane floor: SSE4.1 intrinsic when available, generic mathfuncs code otherwise.
+    realvec floor() const {
+#if !defined __SSE4_1__
+      return MF::vml_floor(*this);
+#else
+      return _mm_floor_pd(v);
+#endif
+    }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const { return _mm_max_pd(v, y.v); }
+    realvec fmin(realvec y) const { return _mm_min_pd(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return _mm_cmpunord_pd(v, v); } // a lane is unordered with itself iff it is NaN
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    // Round each lane to the nearest integer value (SSE4.1 intrinsic when available).
+    realvec rint() const {
+#if !defined __SSE4_1__
+      return MF::vml_rint(*this);
+#else
+      return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+#endif
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    boolvec_t signbit() const { return v; }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const { return _mm_sqrt_pd(v); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    // Round each lane toward zero (SSE4.1 intrinsic when available).
+    realvec trunc() const {
+#if !defined __SSE4_1__
+      return MF::vml_trunc(*this);
+#else
+      return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
+#endif
+    }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  // Bit-cast (no value conversion) of the mask register to the integer vector type.
+  inline auto boolvec<double,2>::as_int() const -> intvec_t
+  {
+    return intvec_t(_mm_castpd_si128(v));
+  }
+  
+  // Logically shifts each lane right by bits-1, leaving that lane's top bit in bit 0.
+  inline auto boolvec<double,2>::convert_int() const -> intvec_t
+  {
+    // Alternative formulation: return ifthen(v, U(1), U(0));
+    intvec_t const raw = as_int();
+    return lsr(raw, bits-1);
+  }
+  
+  // Integer select: passes the operands' bit patterns through the floating-point ifthen.
+  inline auto boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t {
+    realvec_t const chosen = ifthen(x.as_float(), y.as_float());
+    return chosen.as_int();
+  }
+  
+  // Per-lane select: lanes where this mask is true take x, all other lanes take y.
+  inline auto boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t {
+#if !defined __SSE4_1__
+    // Negating convert_int() (0 or 1 per lane) gives an all-zeros / all-ones lane mask
+    intvec_t const pick_x = -convert_int();
+    return ((pick_x & x.as_int()) | (~pick_x & y.as_int())).as_float();
+#else
+    return _mm_blendv_pd(y.v, x.v, v);
+#endif
+  }
+  
+  
+  
+  // intvec definitions
+  
+  // Bit-cast to the floating-point vector type; the bits are not converted numerically.
+  inline auto intvec<double,2>::as_float() const -> realvec_t {
+    return realvec_t(_mm_castsi128_pd(v));
+  }
+  
+  // Numeric integer -> floating-point conversion, delegated to the generic mathfuncs code.
+  inline auto intvec<double,2>::convert_float() const -> realvec_t {
+    return MF::vml_convert_float(*this);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_SSE_DOUBLE2_H
diff --git a/vec_sse_float1.h b/vec_sse_float1.h
new file mode 100644
index 0000000..9e3d12e
--- /dev/null
+++ b/vec_sse_float1.h
@@ -0,0 +1,523 @@
+// -*-C++-*-
+
+#ifndef VEC_SSE2_FLOAT1_H
+#define VEC_SSE2_FLOAT1_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// SSE2 intrinsics
+#include <emmintrin.h>
+#ifdef __SSE3__                 // Intel's SSE 3
+#  include <pmmintrin.h>
+#endif
+#ifdef __SSE4_1__               // Intel's SSE 4.1
+#  include <smmintrin.h>
+#endif
+#ifdef __SSE4A__                // AMD's SSE 4a
+#  include <ammintrin.h>
+#endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FLOAT_1
+  template<> struct boolvec<float,1>;
+  template<> struct intvec<float,1>;
+  template<> struct realvec<float,1>;
+  
+  
+  
+  template<>
+  struct boolvec<float,1>: floatprops<float>
+  {
+    static int const size = 1;
+    typedef bool scalar_t;
+    typedef uint_t bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    // Single-lane boolean "vector" for float, stored in one uint_t:
+    // true values are non-zero, false values are zero
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    // Underlying storage; any non-zero value counts as true
+    bvector_t v;
+    // Constructors; the default constructor leaves v uninitialized
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(a) {}
+    // TODO: remove this
+    boolvec(int x): v(x) {}
+    boolvec(bool const* as): v(as[0]) {}
+    // Element access: there is only one lane, so the index n is ignored
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return v; }
+    boolvec& set_elt(int n, bool a) { return v=a, *this; }
+    
+    
+    // Conversions to the integer vector type (declared here, defined out of class)
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    // Logical operators; each result is built from a C++ bool expression
+    boolvec operator!() const { return !v; }
+    
+    boolvec operator&&(boolvec x) const { return v && x.v; }
+    boolvec operator||(boolvec x) const { return v || x.v; }
+    boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); }
+    boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); }
+    // Reductions: with a single lane both return that lane's truth value
+    bool all() const { return v; }
+    bool any() const { return v; }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,1>: floatprops<float>
+  {
+    static int const size = 1;
+    typedef int_t scalar_t;
+    typedef int_t ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(int_t a): v(a) {}
+    intvec(int_t const* as): v(as[0]) {}
+    static intvec iota() { return intvec(I(0)); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return v; }
+    intvec& set_elt(int n, int_t a) { return v=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return U(v); }
+    boolvec_t convert_bool() const { return bool(v); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    intvec operator+() const { return +v; }
+    intvec operator-() const { return -v; }
+    
+    intvec operator+(intvec x) const { return v+x.v; }
+    intvec operator-(intvec x) const { return v-x.v; }
+    intvec operator*(intvec x) const { return v*x.v; }
+    intvec operator/(intvec x) const { return v/x.v; }
+    intvec operator%(intvec x) const { return v%x.v; }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    intvec& operator*=(intvec const& x) { return *this=*this*x; }
+    intvec& operator/=(intvec const& x) { return *this=*this/x; }
+    intvec& operator%=(intvec const& x) { return *this=*this%x; }
+    
+    
+    
+    intvec operator~() const { return ~v; }
+    
+    intvec operator&(intvec x) const { return v&x.v; }
+    intvec operator|(intvec x) const { return v|x.v; }
+    intvec operator^(intvec x) const { return v^x.v; }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return U(v) >> U(n); }
+    intvec operator>>(int_t n) const { return v>>n; }
+    intvec operator<<(int_t n) const { return v<<n; }
+    
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const { return U(v) >> U(n); }
+    intvec operator>>(intvec n) const { return v>>n; }
+    intvec operator<<(intvec n) const { return v<<n; }
+    
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return *this < IV(I(0));
+    }
+    
+    boolvec_t operator==(intvec const& x) const { return v==x.v; }
+    boolvec_t operator!=(intvec const& x) const { return v!=x.v; }
+    boolvec_t operator<(intvec const& x) const { return v<x.v; }
+    boolvec_t operator<=(intvec const& x) const { return v<=x.v; }
+    boolvec_t operator>(intvec const& x) const { return v>x.v; }
+    boolvec_t operator>=(intvec const& x) const { return v>=x.v; }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,1>: floatprops<float>
+  {
+    static int const size = 1;
+    typedef real_t scalar_t;
+    typedef float vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<SSE2:1*float>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+  private:
+    static __m128 from_float(float a) { return _mm_set_ss(a); }
+    static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
+  public:
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(real_t a): v(a) {}
+    realvec(real_t const* as): v(as[0]) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return v; }
+    realvec& set_elt(int n, real_t a) { return v=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return *p;
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return *p;
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return loada(p+ioff);
+    }
+    // Masked aligned load for the single lane: *p if the mask is fully set, else keep *this.
+    // Tests m.all_m, as the other masked loads/stores here do, rather than all(m.m).
+    realvec_t loada(real_t const* p, mask_t const& m) const {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        return loada(p);
+      }
+      return *this;
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return *this;
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return loada(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      *p = v;
+    }
+    void storeu(real_t* p) const
+    {
+      *p = v;
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      storea(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      storea(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return floatprops::as_int(v); }
+    // Truncating float -> int conversion using the scalar SSE intrinsic
+    // (alternative: floatprops::convert_int(v)).
+    intvec_t convert_int() const
+    { return _mm_cvttss_si32(from_float(v)); }
+    
+    
+    
+    realvec operator+() const { return +v; }
+    realvec operator-() const { return -v; }
+    
+    realvec operator+(realvec x) const { return v+x.v; }
+    realvec operator-(realvec x) const { return v-x.v; }
+    realvec operator*(realvec x) const { return v*x.v; }
+    realvec operator/(realvec x) const { return v/x.v; }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { return v; }
+    real_t sum() const { return v; }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return v==x.v; }
+    boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
+    boolvec_t operator<(realvec const& x) const { return v<x.v; }
+    boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
+    boolvec_t operator>(realvec const& x) const { return v>x.v; }
+    boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const
+    {
+#ifdef __SSE4_1__
+      return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
+#else
+      return std::ceil(v);
+#endif
+    }
+    realvec copysign(realvec y) const { return std::copysign(v, y.v); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return std::fabs(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const
+    {
+#ifdef __SSE4_1__
+      return to_float(_mm_floor_ss(from_float(v), from_float(v)));
+#else
+      return std::floor(v);
+#endif
+    }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const
+    {
+      return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+    }
+    realvec fmin(realvec y) const
+    {
+      return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+    }
+    realvec fmod(realvec y) const { return std::fmod(v, y.v); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    // Integer exponent of v; zero -> min(), NaN -> max(). NaN is tested directly since
+    // FP_ILOGB0 and FP_ILOGBNAN may be equal (both INT_MIN on some C libraries).
+    intvec_t ilogb() const {
+      if (std::isnan(v)) return numeric_limits<int_t>::max();
+      int_t const r = std::ilogb(v);
+      return r == FP_ILOGB0 ? numeric_limits<int_t>::min() : r;
+    }
+    boolvec_t isfinite() const { return std::isfinite(v); }
+    boolvec_t isinf() const { return std::isinf(v); }
+    // True iff v is NaN. Uses an unordered compare: ucomineq(x,x) reports 0 for NaN on some compilers.
+    boolvec_t isnan() const {
+      return _mm_movemask_ps(_mm_cmpunord_ss(from_float(v), from_float(v))) & 1;
+    }
+    boolvec_t isnormal() const { return std::isnormal(v); }
+    realvec ldexp(int_t n) const { return std::ldexp(v, n); }
+    realvec ldexp(intvec_t n) const { return std::ldexp(v, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const { return R(1.0)/v; }
+    realvec remainder(realvec y) const { return std::remainder(v, y.v); }
+    realvec rint() const
+    {
+#ifdef __SSE4_1__
+      return to_float(_mm_round_ss(from_float(v), from_float(v),
+                                   _MM_FROUND_TO_NEAREST_INT));
+#else
+      return MF::vml_rint(*this);
+#endif
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    boolvec_t signbit() const { return std::signbit(v); }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    // realvec sqrt1() const { return std::sqrt(v); }
+    realvec sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const
+    {
+#ifdef __SSE4_1__
+      return to_float(_mm_round_ss(from_float(v), from_float(v),
+                                   _MM_FROUND_TO_ZERO));
+#else
+      return MF::vml_trunc(*this);
+#endif
+    }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<float,1>::as_int() const -> intvec_t
+  {
+    return I(v);
+  }
+  
+  // Converts the truth value to an integer: 1 for true, 0 for false.
+  // v may hold any non-zero value for true (e.g. via intvec::as_bool), so normalize it.
+  inline auto boolvec<float,1>::convert_int() const -> intvec_t {
+    return I(v != 0);
+  }
+  
+  // Single-lane integer select: x when the stored condition is non-zero, y otherwise.
+  inline auto boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t {
+    if (v) return x;
+    return y;
+  }
+  
+  // Single-lane floating-point select: x when the stored condition is non-zero, y otherwise.
+  inline auto boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t {
+    if (v) return x;
+    return y;
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,1>::as_float() const -> realvec_t
+  {
+    return FP::as_float(v);
+  }
+  
+  // int -> float via the scalar SSE conversion (alternative: FP::convert_float(v)).
+  inline auto intvec<float,1>::convert_float() const -> realvec_t {
+    __m128 const converted = _mm_cvtsi32_ss(_mm_setzero_ps(), v);
+    return _mm_cvtss_f32(converted);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_SSE2_FLOAT1_H
diff --git a/vec_sse_float4.h b/vec_sse_float4.h
new file mode 100644
index 0000000..5259cb2
--- /dev/null
+++ b/vec_sse_float4.h
@@ -0,0 +1,651 @@
+// -*-C++-*-
+
+#ifndef VEC_SSE_FLOAT4_H
+#define VEC_SSE_FLOAT4_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// SSE2 intrinsics
+#include <xmmintrin.h>
+#ifdef __SSE3__                 // Intel's SSE 3
+#  include <pmmintrin.h>
+#endif
+#if defined __SSE4_1__          // Intel's SSE 4.1
+#  include <smmintrin.h>
+#endif
+#if defined __SSE4A__           // AMD's SSE 4a
+#  include <ammintrin.h>
+#endif
+#if defined __AVX__             // Intel's AVX
+#  include <immintrin.h>
+#endif
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_FLOAT_4
+  template<> struct boolvec<float,4>;
+  template<> struct intvec<float,4>;
+  template<> struct realvec<float,4>;
+  
+  
+  
+  template<>
+  struct boolvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef bool scalar_t;
+    typedef __m128 bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - int_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a):
+    v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
+    boolvec(bool const* as):
+    v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]),
+                                     from_bool(as[2]),
+                                     from_bool(as[1]),
+                                     from_bool(as[0])))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return _mm_xor_ps(boolvec(true), v); }
+    
+    boolvec operator&&(boolvec x) const { return _mm_and_ps(v, x.v); }
+    boolvec operator||(boolvec x) const { return _mm_or_ps(v, x.v); }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const { return _mm_xor_ps(v, x.v); }
+    
+    // True iff every lane is true; same result as
+    // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3].
+    bool all() const {
+#if !defined __AVX__
+      // AND the two halves together, then neighbouring lanes; lane 0 ends up with the result
+      boolvec const halves = *this && _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,0,3,2));
+      boolvec const folded = halves && _mm_shuffle_ps(halves.v, halves.v, _MM_SHUFFLE(2,3,0,1));
+      return folded[0];
+#else
+      return ! (! *this).any();
+#endif
+    }
+    // True iff at least one lane is true; same result as
+    // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3].
+    bool any() const {
+#if !defined __AVX__
+      // OR the two halves together, then neighbouring lanes; lane 0 ends up with the result
+      boolvec const halves = *this || _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,0,3,2));
+      boolvec const folded = halves || _mm_shuffle_ps(halves.v, halves.v, _MM_SHUFFLE(2,3,0,1));
+      return folded[0];
+#else
+      return ! _mm_testz_ps(v, v);
+#endif
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef int_t scalar_t;
+    typedef __m128i ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm_set1_epi32(a)) {}
+    intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
+    static intvec iota() { return _mm_set_epi32(3, 2, 1, 0); }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
+    boolvec_t convert_bool() const
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
+    }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(0) - *this; }
+    
+    intvec operator+(intvec x) const { return _mm_add_epi32(v, x.v); }
+    intvec operator-(intvec x) const { return _mm_sub_epi32(v, x.v); }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const { return IV(~U(0)) ^ *this; }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v),
+                                         _mm_castsi128_ps(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v),
+                                        _mm_castsi128_ps(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v),
+                                         _mm_castsi128_ps(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return _mm_srli_epi32(v, n); }
+    intvec operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
+    intvec operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      return ! (*this != x);
+    }
+    boolvec_t operator!=(intvec const& x) const
+    {
+      return (*this ^ x).convert_bool();
+    }
+    // Signed per-lane less-than; a true lane is all ones (sign bit set), a false lane all zeros.
+    // A single SSE2 compare replaces the scalar loop that filled an
+    // uninitialized boolvec lane by lane through a pointer cast.
+    boolvec_t operator<(intvec const& x) const
+    {
+      // _mm_cmplt_epi32 yields 0xFFFFFFFF / 0 per lane, the same bit patterns
+      // that boolvec's from_bool() produces
+      return _mm_castsi128_ps(_mm_cmplt_epi32(v, x.v));
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,4>: floatprops<float>
+  {
+    static int const size = 4;
+    typedef real_t scalar_t;
+    typedef __m128 vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<SSE2:4*float>"; }
+    void barrier() { __asm__("": "+x" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm_set1_ps(a)) {}
+    realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return _mm_load_ps(p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      return _mm_loadu_ps(p);
+    }
+    // Load from p+ioff, where p itself is aligned: an offset that is a whole
+    // number of vectors (including 0) keeps the address aligned, so loada is used.
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return ioff % realvec::size == 0 ? loada(p+ioff) : loadu(p+ioff);
+    }
+    // Masked aligned load: lanes enabled in m come from p, the remaining lanes keep *this.
+    // Tests m.all_m, as the other masked loads/stores here do, rather than all(m.m).
+    realvec_t loada(real_t const* p, mask_t const& m) const {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        return loada(p);
+      }
+      return m.m.ifthen(loada(p), *this);
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      _mm_store_ps(p, v);
+    }
+    void storeu(real_t* p) const
+    {
+      return _mm_storeu_ps(p, v);
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+#if defined __AVX__
+        _mm_maskstore_ps(p, m.m.as_int(), v);
+#else
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+#endif
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+        if (m.m[2]) p[2] = (*this)[2];
+        if (m.m[3]) p[3] = (*this)[3];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return _mm_castps_si128(v); }
+    intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return _mm_xor_ps(v, _mm_set1_ps(-0.0f)); } // flip sign bit, so -(+0.0) is -0.0
+    
+    realvec operator+(realvec x) const { return _mm_add_ps(v, x.v); }
+    realvec operator-(realvec x) const { return _mm_sub_ps(v, x.v); }
+    realvec operator*(realvec x) const { return _mm_mul_ps(v, x.v); }
+    realvec operator/(realvec x) const { return _mm_div_ps(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    }
+    real_t sum() const
+    { // horizontal sum of the four lanes
+#ifdef __SSE3__
+      realvec x = *this;
+      x = _mm_hadd_ps(x.v, x.v);  // lanes become (0+1, 2+3, 0+1, 2+3)
+      x = _mm_hadd_ps(x.v, x.v);  // every lane now holds 0+1+2+3
+      return x[0];
+#else
+      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+#endif
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const
+    {
+      return _mm_cmpeq_ps(v, x.v);
+    }
+    boolvec_t operator!=(realvec const& x) const
+    {
+      return _mm_cmpneq_ps(v, x.v);
+    }
+    boolvec_t operator<(realvec const& x) const
+    {
+      return _mm_cmplt_ps(v, x.v);
+    }
+    boolvec_t operator<=(realvec const& x) const
+    {
+      return _mm_cmple_ps(v, x.v);
+    }
+    boolvec_t operator>(realvec const& x) const
+    {
+      return _mm_cmpgt_ps(v, x.v);
+    }
+    boolvec_t operator>=(realvec const& x) const
+    {
+      return _mm_cmpge_ps(v, x.v);
+    }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const
+    {
+#ifdef __SSE4_1__
+      return _mm_ceil_ps(v);
+#else
+      return MF::vml_ceil(*this);
+#endif
+    }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const
+    {
+#ifdef __SSE4_1__
+      return _mm_floor_ps(v);
+#else
+      return MF::vml_floor(*this);
+#endif
+    }
+    realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); }
+    realvec fmax(realvec y) const { return _mm_max_ps(v, y.v); }
+    realvec fmin(realvec y) const { return _mm_min_ps(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return _mm_cmpunord_ps(v, v); } // a lane is unordered with itself only if NaN
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const
+    {
+      realvec x = *this;
+      realvec r = _mm_rcp_ps(x); // this is only an approximation
+      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
+      return r;
+    }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const
+    {
+#ifdef __SSE4_1__
+      return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+#else
+      return MF::vml_rint(*this);
+#endif
+    }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const
+    {
+      realvec x = *this;
+      realvec r = _mm_rsqrt_ps(x);    // this is only an approximation
+      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
+      return r;
+    }
+    boolvec_t signbit() const { return v; }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    realvec sqrt() const { return _mm_sqrt_ps(v); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const
+    {
+#ifdef __SSE4_1__
+      return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
+#else
+      return MF::vml_trunc(*this);
+#endif
+    }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<float,4>::as_int() const -> intvec_t
+  {
+    return _mm_castps_si128(v);
+  }
+  
+  inline
+  auto boolvec<float,4>::convert_int() const -> intvec_t
+  {
+    return lsr(as_int(), bits-1);
+  }
+  
+  inline
+  auto boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  auto boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  { // per lane: x where this mask is true, y otherwise
+#ifdef __SSE4_1__
+    return _mm_blendv_ps(y.v, x.v, v);  // picks x where the sign bit of v is set
+#else
+    return (( -convert_int() & x.as_int()) |  // -convert_int() is all-ones in true lanes, 0 otherwise
+            (~-convert_int() & y.as_int())).as_float();
+#endif
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,4>::as_float() const -> realvec_t
+  {
+    return _mm_castsi128_ps(v);
+  }
+  
+  inline auto intvec<float,4>::convert_float() const -> realvec_t
+  {
+    return _mm_cvtepi32_ps(v);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_SSE_FLOAT4_H
diff --git a/vec_vsx_double2.h b/vec_vsx_double2.h
new file mode 100644
index 0000000..024056d
--- /dev/null
+++ b/vec_vsx_double2.h
@@ -0,0 +1,656 @@
+// -*-C++-*-
+
+#ifndef VEC_VSX_DOUBLE2_H
+#define VEC_VSX_DOUBLE2_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// VSX intrinsics
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+
+
+namespace vecmathlib {
+  
+#define VECMATHLIB_HAVE_VEC_DOUBLE_2
+  template<> struct boolvec<double,2>;
+  template<> struct intvec<double,2>;
+  template<> struct realvec<double,2>;
+  
+  
+  
+  template<>
+  struct boolvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef bool scalar_t;
+    typedef __vector __bool long long bvector_t;
+    static int const alignment = sizeof(bvector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values are -1, false values are 0
+    // truth values are interpreted bit-wise
+    static uint_t from_bool(bool a) { return -int_t(a); }
+    static bool to_bool(uint_t a) { return a; }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // boolvec(boolvec const& x): v(x.v) {}
+    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a): v(vec_splats(from_bool(a))) {}
+    boolvec(bool const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((uint_t*)&v)[n]=from_bool(a), *this;
+    }
+    
+    
+    
+    intvec_t as_int() const;      // defined after intvec
+    intvec_t convert_int() const; // defined after intvec
+    
+    
+    
+    boolvec operator!() const
+    {
+      return
+	(__vector __bool long long)(__vector long long)
+	vec_nor((__vector double)(__vector long long)v,
+		(__vector double)(__vector long long)v);
+    }
+    
+    boolvec operator&&(boolvec x) const
+    {
+      return
+	(__vector __bool long long)(__vector long long)
+	vec_and((__vector double)(__vector long long)v,
+		(__vector double)(__vector long long)x.v);
+    }
+    boolvec operator||(boolvec x) const
+    {
+      return
+	(__vector __bool long long)(__vector long long)
+	vec_or((__vector double)(__vector long long)v,
+	       (__vector double)(__vector long long)x.v);
+    }
+    boolvec operator==(boolvec x) const { return !(*this!=x); }
+    boolvec operator!=(boolvec x) const
+    {
+      return
+	(__vector __bool long long)(__vector long long)
+	vec_xor((__vector double)(__vector long long)v,
+		(__vector double)(__vector long long)x.v);
+    }
+    
+    bool all() const
+    {
+      return vec_all_ne((__vector int)v, (__vector int)BV(false).v);
+    }
+    bool any() const
+    {
+      return vec_any_ne((__vector int)v, (__vector int)BV(false).v);
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef int_t scalar_t;
+    typedef __vector long long ivector_t;
+    static int const alignment = sizeof(ivector_t);
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // intvec(intvec const& x): v(x.v) {}
+    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(vec_splats(a)) {}
+    intvec(int_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    static intvec iota() { return (__vector long long){0, 1}; }
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    // Vector casts do not change the bit pattern
+    boolvec_t as_bool() const { return (__vector __bool long long)v; }
+    boolvec_t convert_bool() const { return *this != IV(I(0)); }
+    realvec_t as_float() const;      // defined after realvec
+    realvec_t convert_float() const; // defined after realvec
+    
+    
+    
+    // Permutation control words
+  private:
+    // 0123 4567 -> 1436
+    // exchange pairs
+    static __vector unsigned char perm_int_swap()
+    {
+      return
+	(__vector unsigned char)
+	{4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27};
+    }
+    // 0123 4567 -> 0426
+    // broadcast high elements of pairs
+    static __vector unsigned char perm_int_bchi()
+    {
+      return
+	(__vector unsigned char)
+	{0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
+    }
+  public:
+    
+    
+
+    intvec operator+() const { return *this; }
+    intvec operator-() const { return IV(I(0)) - *this; }
+    
+    intvec operator+(intvec x) const
+    {
+      // 64-bit add built from 32-bit adds (instead of vec_add(v, x.v))
+      __vector unsigned int a = (__vector unsigned int)v;
+      __vector unsigned int b = (__vector unsigned int)x.v;
+      __vector unsigned int s = vec_add(a, b);   // per-word sums
+      __vector unsigned int c = vec_addc(a, b);  // per-word carry-outs
+      __vector unsigned int z = vec_splats(0U);  // zero vector (was read uninitialized)
+      c = vec_perm(c, z, perm_int_swap());       // carry of word 1/3 -> word 0/2
+      s = vec_add(s, c);
+      return (__vector long long)s;
+    }
+    intvec operator-(intvec x) const
+    {
+      // 64-bit subtract built from 32-bit ops (instead of vec_sub(v, x.v))
+      __vector unsigned int a = (__vector unsigned int)v;
+      __vector unsigned int b = (__vector unsigned int)x.v;
+      __vector unsigned int d = vec_sub(a, b);   // per-word differences
+      __vector unsigned int c = vec_subc(a, b);  // per-word carry-outs
+      c = vec_sub(vec_splats(1U), c);            // borrow = 1 - carry
+      __vector unsigned int z = vec_splats(0U);  // zero vector (was read uninitialized)
+      c = vec_perm(c, z, perm_int_swap());       // borrow of word 1/3 -> word 0/2
+      d = vec_sub(d, c);
+      return (__vector long long)d;
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const
+    {
+      return (__vector long long)vec_nor((__vector int)v, (__vector int)v);
+    }
+    
+    intvec operator&(intvec x) const
+    {
+      return (__vector long long)vec_and((__vector int)v, (__vector int)x.v);
+    }
+    intvec operator|(intvec x) const
+    {
+      return (__vector long long)vec_or ((__vector int)v, (__vector int)x.v);
+    }
+    intvec operator^(intvec x) const
+    {
+      return (__vector long long)vec_xor((__vector int)v, (__vector int)x.v);
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return lsr(IV(n)); }
+    intvec operator>>(int_t n) const { return *this >> IV(n); }
+    intvec operator<<(int_t n) const { return *this << IV(n); }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      // return vec_sr(v, (__vector unsigned long long)n.v);
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, U((*this)[i]) >> U(n[i]));
+      }
+      return r;
+    }
+    intvec operator>>(intvec n) const
+    {
+      // return vec_sra(v, (__vector unsigned long long)n.v);
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] >> n[i]);
+      }
+      return r;
+    }
+    intvec operator<<(intvec n) const
+    {
+      // return vec_sl(v, (__vector unsigned long long)n.v);
+      intvec r;
+      for (int i=0; i<size; ++i) {
+        r.set_elt(i, (*this)[i] << n[i]);
+      }
+      return r;
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+    
+    
+    
+    boolvec_t signbit() const
+    {
+      return (*this >> (bits-1)).as_bool();
+    }
+    
+    boolvec_t operator==(intvec const& x) const
+    {
+      // 64-bit equality from 32-bit compares (instead of vec_cmpeq(v, x.v))
+      __vector int a = (__vector int)v;
+      __vector int b = (__vector int)x.v;
+      __vector __bool int c = vec_cmpeq(a, b);  // per-word equality
+      __vector __bool int cx = vec_perm(c, c, perm_int_swap());  // words swapped within each pair
+      __vector __bool int r = vec_and(c, cx);  // true only if both words of a pair match
+      return (__vector __bool long long)r;
+    }
+    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
+    boolvec_t operator<(intvec const& x) const
+    { // 64-bit signed less-than assembled from 32-bit compares
+      __vector int a = (__vector int)v;
+      __vector int b = (__vector int)x.v;
+      __vector __bool int lt = vec_cmplt(a, b);  // signed per-word a < b
+      __vector __bool int eq = vec_cmpeq(a, b);  // per-word a == b
+      __vector unsigned int ua = (__vector unsigned int)v;
+      __vector unsigned int ub = (__vector unsigned int)x.v;
+      __vector __bool int ult = vec_cmplt(ua, ub);  // unsigned per-word a < b
+      __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());  // words swapped within each pair
+      __vector __bool int r = vec_or(lt, vec_and(eq, ultx));  // word 0/2: lt, or eq and the other word is ult
+      r = vec_perm(r, r, perm_int_bchi());  // copy word 0/2 over word 1/3
+      return (__vector __bool long long)r;
+    }
+    boolvec_t operator<=(intvec const& x) const
+    {
+      return ! (*this > x);
+    }
+    boolvec_t operator>(intvec const& x) const
+    {
+      return x < *this;
+    }
+    boolvec_t operator>=(intvec const& x) const
+    {
+      return ! (*this < x);
+    }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,2>: floatprops<double>
+  {
+    static int const size = 2;
+    typedef real_t scalar_t;
+    typedef __vector double vector_t;
+    static int const alignment = sizeof(vector_t);
+    
+    static char const* name() { return "<VSX:2*double>"; }
+    void barrier() { __asm__("": "+v" (v)); }
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    // Can't have a non-trivial copy constructor; if so, objects won't
+    // be passed in registers
+    // realvec(realvec const& x): v(x.v) {}
+    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(vec_splats(a)) {}
+    realvec(real_t const* as)
+    {
+      for (int d=0; d<size; ++d) set_elt(d, as[d]);
+    }
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    typedef vecmathlib::mask_t<realvec_t> mask_t;
+    
+    static realvec_t loada(real_t const* p)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      return vec_ld(0, (const __vector double*)p);
+    }
+    static realvec_t loadu(real_t const* p)
+    {
+      realvec_t v0 = vec_ld(0, (const __vector double*)p);
+      realvec_t v1 = vec_ld(15, (const __vector double*)p);
+      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
+    }
+    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff);
+      return loadu(p+ioff);
+    }
+    realvec_t loada(real_t const* p, mask_t const& m) const
+    { // masked aligned load: lanes where m.m is false keep the value of *this
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) { // same all-lanes flag the other masked loads/stores test
+        return loada(p);
+      } else {
+        return m.m.ifthen(loada(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        return loadu(p);
+      } else {
+        return m.m.ifthen(loadu(p), *this);
+      }
+    }
+    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return loada(p+ioff, m);
+      return loadu(p+ioff, m);
+    }
+    
+    void storea(real_t* p) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      vec_st(v, 0, (__vector double*)p);
+    }
+    void storeu(real_t* p) const
+    {
+      // Vector stores would require vector loads, which would need to
+      // be atomic
+      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
+      p[0] = (*this)[0];
+      p[1] = (*this)[1];
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff);
+      storeu(p+ioff);
+    }
+    void storea(real_t* p, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (__builtin_expect(m.all_m, true)) {
+        storea(p);
+      } else {
+	// Use vec_ste?
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+      }
+    }
+    void storeu(real_t* p, mask_t const& m) const
+    {
+      if (__builtin_expect(m.all_m, true)) {
+        storeu(p);
+      } else {
+	// Use vec_ste?
+        if (m.m[0]) p[0] = (*this)[0];
+        if (m.m[1]) p[1] = (*this)[1];
+      }
+    }
+    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
+    {
+      VML_ASSERT(intptr_t(p) % alignment == 0);
+      if (ioff % realvec::size == 0) return storea(p+ioff, m);
+      storeu(p+ioff, m);
+    }
+    
+    
+    
+    intvec_t as_int() const { return (__vector long long) v; }
+    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return RV(0.0) - *this; }
+    
+    realvec operator+(realvec x) const { return vec_add(v, x.v); }
+    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+    realvec operator/(realvec x) const { return vec_div(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1]; // size == 2: only lanes 0 and 1 exist
+    }
+    real_t sum() const
+    {
+      return (*this)[0] + (*this)[1]; // size == 2: only lanes 0 and 1 exist
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
+    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
+    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
+    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
+    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
+    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec acosh() const { return MF::vml_acosh(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec asinh() const { return MF::vml_asinh(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+    realvec atanh() const { return MF::vml_atanh(*this); }
+    realvec cbrt() const { return MF::vml_cbrt(*this); }
+    realvec ceil() const { return vec_ceil(v); }
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec cos() const { return MF::vml_cos(*this); }
+    realvec cosh() const { return MF::vml_cosh(*this); }
+    realvec exp() const { return MF::vml_exp(*this); }
+    realvec exp10() const { return MF::vml_exp10(*this); }
+    realvec exp2() const { return MF::vml_exp2(*this); }
+    realvec expm1() const { return MF::vml_expm1(*this); }
+    realvec fabs() const { return vec_abs(v); }
+    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+    realvec floor() const { return vec_floor(v); }
+    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+    realvec fmax(realvec y) const { return vec_max(v, y.v); }
+    realvec fmin(realvec y) const { return vec_min(v, y.v); }
+    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+    boolvec_t isinf() const { return MF::vml_isinf(*this); }
+    boolvec_t isnan() const { return MF::vml_isnan(*this); }
+    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+    realvec rcp() const
+    {
+      realvec x = *this;
+      realvec r = vec_re(v);    // this is only an approximation
+      // TODO: use fma
+      // Note: don't rewrite this expression, this may introduce
+      // cancellation errors
+      r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp)
+      r += r * (RV(1.0) - x*r);
+      return r;
+    }
+    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+    realvec rint() const { return vec_rint(v); }
+    realvec round() const { return MF::vml_round(*this); }
+    realvec rsqrt() const
+    {
+      // realvec x = *this;
+      // realvec r = vec_rsqrte(x.v); // this is only an approximation
+      // // TODO: use fma
+      // // one Newton iteration (see vml_rsqrt)
+      // r += RV(0.5)*r * (RV(1.0) - x * r*r);
+      // return r;
+      return vec_rsqrt(v);
+    }
+    boolvec_t signbit() const { return MF::vml_signbit(*this); }
+    realvec sin() const { return MF::vml_sin(*this); }
+    realvec sinh() const { return MF::vml_sinh(*this); }
+    // realvec sqrt() const { return *this * rsqrt(); }
+    realvec sqrt() const { return vec_sqrt(v); }
+    realvec tan() const { return MF::vml_tan(*this); }
+    realvec tanh() const { return MF::vml_tanh(*this); }
+    realvec trunc() const { return vec_trunc(v); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<double,2>::as_int() const -> intvec_t
+  {
+    return (__vector long long) v;
+  }
+  
+  inline
+  auto boolvec<double,2>::convert_int() const -> intvec_t
+  {
+    return -(__vector long long)v;
+  }
+  
+  inline
+  auto boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return vec_sel(y.v, x.v, v);
+  }
+  
+  inline
+  auto boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return vec_sel(y.v, x.v, v);
+  }
+  
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<double,2>::as_float() const -> realvec_t
+  {
+    return (__vector double)v;
+  }
+  
+  inline auto intvec<double,2>::convert_float() const -> realvec_t
+  {
+    // return vec_ctd(v, 0);
+    return MF::vml_convert_float(*this);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_VSX_DOUBLE2_H
diff --git a/vecmathlib.h b/vecmathlib.h
index e79188a..3e04b19 100644
--- a/vecmathlib.h
+++ b/vecmathlib.h
@@ -53,35 +53,38 @@ namespace std { class type_info; }
 
 #if defined __ARM_PCS_VFP       // ARM NEON
 // TODO: VFP
-#  include "vec_float_neon.h"
+// TODO: vec_neon_float4
+#  include "vec_neon_float2.h"
 #endif
 
 #if defined __SSE2__            // Intel SSE 2
-#  include "vec_float_sse2_scalar.h"
-#  include "vec_double_sse2_scalar.h"
-#  include "vec_float_sse2.h"
-#  include "vec_double_sse2.h"
+#  include "vec_sse_float1.h"
+#  include "vec_sse_float4.h"
+#  include "vec_sse_double1.h"
+#  include "vec_sse_double2.h"
 #endif
 
 #if defined __AVX__             // Intel AVX
-#  include "vec_fp8_avx.h"
-#  include "vec_fp16_avx.h"
-#  include "vec_float_avx.h"
-#  include "vec_double_avx.h"
+#  include "vec_avx_fp8_32.h"
+#  include "vec_avx_fp16_16.h"
+#  include "vec_avx_float8.h"
+#  include "vec_avx_double4.h"
 #endif
 
+// TODO: MIC
+
 #if defined __ALTIVEC__         // IBM Altivec
-#  include "vec_float_altivec.h"
+#  include "vec_altivec_float4.h"
 #endif
 #if defined __VSX__             // IBM VSX
-#  include "vec_double_vsx.h"
+#  include "vec_vsx_double2.h"
 #endif
 
 // TODO: IBM Blue Gene/P DoubleHummer
 
 #if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX
-// TODO: vec_float_qpx
-#  include "vec_double_qpx.h"
+// TODO: vec_qpx_float4
+#  include "vec_qpx_double4.h"
 #endif
 
 #endif // #ifndef VECMATHLIB_H
-- 
cgit v1.1