44 files changed, 15431 insertions, 17970 deletions
diff --git a/bench.cc b/bench.cc
index e795985..ac3eb46 100644
--- a/bench.cc
+++ b/bench.cc
@@ -16,47 +16,38 @@
 using namespace std;
 using namespace vecmathlib;
 
-
-
 #ifndef __has_builtin
-#  define __has_builtin(x) 0 // Compatibility with non-clang compilers
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers
 #endif
 
-
-
 typedef unsigned long long ticks;
-inline ticks getticks()
-{
+inline ticks getticks() {
 #if __has_builtin(__builtin_readcyclecounter)
   return __builtin_readcyclecounter();
 #elif defined __x86_64__
   ticks a, d;
-  asm volatile("rdtsc" : "=a" (a), "=d" (d));
+  asm volatile("rdtsc" : "=a"(a), "=d"(d));
   return a | (d << 32);
 #elif defined __powerpc__
   unsigned int tbl, tbu, tbu1;
   do {
-    asm volatile("mftbu %0": "=r"(tbu));
-    asm volatile("mftb %0": "=r"(tbl));
-    asm volatile("mftbu %0": "=r"(tbu1));
+    asm volatile("mftbu %0" : "=r"(tbu));
+    asm volatile("mftb %0" : "=r"(tbl));
+    asm volatile("mftbu %0" : "=r"(tbu1));
   } while (tbu != tbu1);
   return ((unsigned long long)tbu << 32) | tbl;
 #else
   timeval tv;
   gettimeofday(&tv, NULL);
   return 1000000ULL * tv.tv_sec + tv.tv_usec;
-  // timespec ts;
-  // clock_gettime(CLOCK_REALTIME, &ts);
-  // return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+// timespec ts;
+// clock_gettime(CLOCK_REALTIME, &ts);
+// return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
 #endif
 }
-inline double elapsed(ticks t1, ticks t0)
-{
-  return t1-t0;
-}
+inline double elapsed(ticks t1, ticks t0) { return t1 - t0; }
 
-double get_sys_time()
-{
+double get_sys_time() {
   timeval tv;
   gettimeofday(&tv, NULL);
   return tv.tv_sec + 1.0e-6 * tv.tv_usec;
@@ -65,8 +56,7 @@ double get_sys_time()
   // return ts.tv_sec + 1.0e-9 * ts.tv_nsec;
 }
 
-double measure_tick()
-{
+double measure_tick() {
   ticks const rstart = getticks();
   double const wstart = get_sys_time();
   while (get_sys_time() - wstart < 0.1) {
@@ -74,124 +64,103 @@ double measure_tick()
   }
   ticks const rend = getticks();
   double const wend = get_sys_time();
-  assert(wend-wstart >= 0.09);
+  assert(wend - wstart >= 0.09);
   return (wend - wstart) / elapsed(rend, rstart);
 }
 
-
-
 double global_result = 0.0;
-template<typename realvec_t>
-void save_result(realvec_t result)
-{
-  for (int i=0; i<realvec_t::size; ++i) {
+template <typename realvec_t> void save_result(realvec_t result) {
+  for (int i = 0; i < realvec_t::size; ++i) {
     global_result += result[i];
   }
   // Check global accumulator to prevent optimisation
-  if (! vml_std::isfinite(global_result)) {
+  if (!vml_std::isfinite(global_result)) {
     cout << "\n"
          << "WARNING: Global accumulator is not finite\n";
   }
 }
 
+template <typename T> inline T nop(T x) { return x; }
 
+template <typename T> inline T fneg(T x) { return -x; }
 
-template<typename T> inline T nop(T x) { return x; }
-
-template<typename T> inline T fneg(T x) { return -x; }
+template <typename T> inline T fadd(T x, T y) { return x + y; }
+template <typename T> inline T fsub(T x, T y) { return x - y; }
+template <typename T> inline T fmul(T x, T y) { return x * y; }
+template <typename T> inline T fdiv(T x, T y) { return x / y; }
 
-template<typename T> inline T fadd(T x, T y) { return x+y; }
-template<typename T> inline T fsub(T x, T y) { return x-y; }
-template<typename T> inline T fmul(T x, T y) { return x*y; }
-template<typename T> inline T fdiv(T x, T y) { return x/y; }
-
-template<typename T> inline T frexp0(T x)
-{
+template <typename T> inline T frexp0(T x) {
   typename T::intvec_t ir;
   return frexp(x, &ir);
 }
-template<typename T> inline typename T::intvec_t frexp1(T x)
-{
+template <typename T> inline typename T::intvec_t frexp1(T x) {
   typename T::intvec_t ir;
   frexp(x, &ir);
   return ir;
 }
 
-template<typename T> inline T ldexps(T x, T y)
-{
+template <typename T> inline T ldexps(T x, T y) {
   typename T::intvec_t iy = convert_int(y);
   return ldexp(x, iy[0]);
 }
-template<typename T> inline T ldexpv(T x, T y)
-{
+template <typename T> inline T ldexpv(T x, T y) {
   typename T::intvec_t iy = convert_int(y);
   return ldexp(x, iy);
 }
 
-
-
-#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX)                       \
-  template<typename T>                                          \
-  struct functor_##FUNC {                                       \
-    static typename T::real_t get_xmin() { return XMIN; }       \
-    static typename T::real_t get_xmax() { return XMAX; }       \
-    static const char* name() { return #FUNC; }                 \
-    T operator()(T x) {                                         \
-      return FUNC(x);                                           \
-    }                                                           \
+#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX)                                      \
+  template <typename T> struct functor_##FUNC {                                \
+    static typename T::real_t get_xmin() { return XMIN; }                      \
+    static typename T::real_t get_xmax() { return XMAX; }                      \
+    static const char *name() { return #FUNC; }                                \
+    T operator()(T x) { return FUNC(x); }                                      \
   }
 
-#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX)                      \
-  template<typename T>                                          \
-  struct functor_##FUNC {                                       \
-    static typename T::real_t get_xmin() { return XMIN; }       \
-    static typename T::real_t get_xmax() { return XMAX; }       \
-    static const char* name() { return #FUNC; }                 \
-    T operator()(T x) {                                         \
-      typename T::boolvec_t res = FUNC(x);                      \
-      return convert_float(convert_int(res));                   \
-    }                                                           \
+#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX)                                     \
+  template <typename T> struct functor_##FUNC {                                \
+    static typename T::real_t get_xmin() { return XMIN; }                      \
+    static typename T::real_t get_xmax() { return XMAX; }                      \
+    static const char *name() { return #FUNC; }                                \
+    T operator()(T x) {                                                        \
+      typename T::boolvec_t res = FUNC(x);                                     \
+      return convert_float(convert_int(res));                                  \
+    }                                                                          \
   }
 
-#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX)                      \
-  template<typename T>                                          \
-  struct functor_##FUNC {                                       \
-    static typename T::real_t get_xmin() { return XMIN; }       \
-    static typename T::real_t get_xmax() { return XMAX; }       \
-    static const char* name() { return #FUNC; }                 \
-    T operator()(T x) {                                         \
-      typename T::intvec_t res = FUNC(x);                       \
-      return convert_float(res);                                \
-    }                                                           \
+#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX)                                     \
+  template <typename T> struct functor_##FUNC {                                \
+    static typename T::real_t get_xmin() { return XMIN; }                      \
+    static typename T::real_t get_xmax() { return XMAX; }                      \
+    static const char *name() { return #FUNC; }                                \
+    T operator()(T x) {                                                        \
+      typename T::intvec_t res = FUNC(x);                                      \
+      return convert_float(res);                                               \
+    }                                                                          \
   }
 
-#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET)             \
-  template<typename T>                                          \
-  struct functor_##FUNC {                                       \
-    static typename T::real_t get_xmin() { return XMIN; }       \
-    static typename T::real_t get_xmax() { return XMAX; }       \
-    static const char* name() { return #FUNC; }                 \
-    T operator()(T x) {                                         \
-      const typename T::real_t yoffset = YOFFSET;               \
-      return FUNC(x, x + T(yoffset));                           \
-    }                                                           \
+#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET)                            \
+  template <typename T> struct functor_##FUNC {                                \
+    static typename T::real_t get_xmin() { return XMIN; }                      \
+    static typename T::real_t get_xmax() { return XMAX; }                      \
+    static const char *name() { return #FUNC; }                                \
+    T operator()(T x) {                                                        \
+      const typename T::real_t yoffset = YOFFSET;                              \
+      return FUNC(x, x + T(yoffset));                                          \
+    }                                                                          \
   }
 
-#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET)    \
-  template<typename T>                                          \
-  struct functor_##FUNC {                                       \
-    static typename T::real_t get_xmin() { return XMIN; }       \
-    static typename T::real_t get_xmax() { return XMAX; }       \
-    static const char* name() { return #FUNC; }                 \
-    T operator()(T x) {                                         \
-      const typename T::real_t yoffset = YOFFSET;               \
-      const typename T::real_t zoffset = ZOFFSET;               \
-      return FUNC(x, x + T(yoffset), x + T(zoffset));           \
-    }                                                           \
+#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET)                   \
+  template <typename T> struct functor_##FUNC {                                \
+    static typename T::real_t get_xmin() { return XMIN; }                      \
+    static typename T::real_t get_xmax() { return XMAX; }                      \
+    static const char *name() { return #FUNC; }                                \
+    T operator()(T x) {                                                        \
+      const typename T::real_t yoffset = YOFFSET;                              \
+      const typename T::real_t zoffset = ZOFFSET;                              \
+      return FUNC(x, x + T(yoffset), x + T(zoffset));                          \
+    }                                                                          \
   }
 
-
-
 DECLARE_FUNCTOR(nop, 0.0, 1.0);
 
 DECLARE_FUNCTOR(fneg, 0.0, 1.0);
@@ -252,137 +221,127 @@ DECLARE_FUNCTOR(tan, 0.0, 1.0);
 DECLARE_FUNCTOR(tanh, -1.0, +1.0);
 DECLARE_FUNCTOR(trunc, -1.0, +1.0);
 
-
-
-template<typename realvec_t, template<typename> class func_t>
-double run_bench()
-{
+template <typename realvec_t, template <typename> class func_t>
+double run_bench() {
   const int numiters = 1000000;
-  
+
   typedef typename realvec_t::real_t real_t;
   const real_t xmin = func_t<realvec_t>::get_xmin();
   const real_t xmax = func_t<realvec_t>::get_xmax();
   realvec_t x0, dx;
-  for (int i=0; i<realvec_t::size; ++i) {
+  for (int i = 0; i < realvec_t::size; ++i) {
     x0.set_elt(i, xmin + (xmax - xmin) / numiters * i / realvec_t::size);
     dx.set_elt(i, (xmax - xmin) / numiters);
   }
   realvec_t x, y;
   ticks t0, t1;
   double const cycles_per_tick = 1.0; // measure_tick();
-  
+
   func_t<realvec_t> func;
   t0 = getticks();
   x = y = x0;
-  for (int n=0; n<numiters; ++n) {
+  for (int n = 0; n < numiters; ++n) {
     y += func(x);
     x += dx;
   }
   t1 = getticks();
   save_result(y);
-  
-  return cycles_per_tick * elapsed(t1,t0) * realvec_t::size / numiters;
+
+  return cycles_per_tick * elapsed(t1, t0) * realvec_t::size / numiters;
 }
 
-template<typename realvec_t, template<typename> class func_t>
-void bench_type_func()
-{
-  cout << "   "
-       << setw(-5) << func_t<realvec_t>::name() << " "
-       << setw(18) << realvec_t::name() << ": " << flush;
+template <typename realvec_t, template <typename> class func_t>
+void bench_type_func() {
+  cout << "   " << setw(-5) << func_t<realvec_t>::name() << " " << setw(18)
+       << realvec_t::name() << ": " << flush;
   double const cycles = run_bench<realvec_t, func_t>();
   cout << cycles << " cycles\n" << flush;
 }
 
-template<template<typename> class func_t>
-void bench_func()
-{
+template <template <typename> class func_t> void bench_func() {
   cout << "\n"
        << "Benchmarking " << func_t<float32_vec>().name() << ":\n";
-  
+
   // Note: We benchmark neither testvec (since this is known to be
   // slow), nor builtinvec (since this has about the same performance
   // as pseudovec, and is also not very efficient).
-  
-  bench_type_func<realpseudovec<float,1>, func_t>();
+
+  bench_type_func<realpseudovec<float, 1>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<float,1>, func_t>();
+  bench_type_func<realbuiltinvec<float, 1>, func_t>();
 #endif
-  bench_type_func<realtestvec<float,1>, func_t>();
+  bench_type_func<realtestvec<float, 1>, func_t>();
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
-  bench_type_func<realvec<float,1>, func_t>();
+  bench_type_func<realvec<float, 1>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_2
-  bench_type_func<realpseudovec<float,2>, func_t>();
+  bench_type_func<realpseudovec<float, 2>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<float,2>, func_t>();
+  bench_type_func<realbuiltinvec<float, 2>, func_t>();
 #endif
   // bench_type_func<realtestvec<float,2>, func_t>();
-  bench_type_func<realvec<float,2>, func_t>();
+  bench_type_func<realvec<float, 2>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_4
-  bench_type_func<realpseudovec<float,4>, func_t>();
+  bench_type_func<realpseudovec<float, 4>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<float,4>, func_t>();
+  bench_type_func<realbuiltinvec<float, 4>, func_t>();
 #endif
   // bench_type_func<realtestvec<float,4>, func_t>();
-  bench_type_func<realvec<float,4>, func_t>();
+  bench_type_func<realvec<float, 4>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
-  bench_type_func<realpseudovec<float,8>, func_t>();
+  bench_type_func<realpseudovec<float, 8>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<float,8>, func_t>();
+  bench_type_func<realbuiltinvec<float, 8>, func_t>();
 #endif
   // bench_type_func<realtestvec<float,8>, func_t>();
-  bench_type_func<realvec<float,8>, func_t>();
+  bench_type_func<realvec<float, 8>, func_t>();
 #endif
-  
-  bench_type_func<realpseudovec<double,1>, func_t>();
+
+  bench_type_func<realpseudovec<double, 1>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<double,1>, func_t>();
+  bench_type_func<realbuiltinvec<double, 1>, func_t>();
 #endif
-  bench_type_func<realtestvec<double,1>, func_t>();
+  bench_type_func<realtestvec<double, 1>, func_t>();
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
-  bench_type_func<realvec<double,1>, func_t>();
+  bench_type_func<realvec<double, 1>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
-  bench_type_func<realpseudovec<double,2>, func_t>();
+  bench_type_func<realpseudovec<double, 2>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<double,2>, func_t>();
+  bench_type_func<realbuiltinvec<double, 2>, func_t>();
 #endif
   // bench_type_func<realtestvec<double,2>, func_t>();
-  bench_type_func<realvec<double,2>, func_t>();
+  bench_type_func<realvec<double, 2>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
-  bench_type_func<realpseudovec<double,4>, func_t>();
+  bench_type_func<realpseudovec<double, 4>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<double,4>, func_t>();
+  bench_type_func<realbuiltinvec<double, 4>, func_t>();
 #endif
   // bench_type_func<realtestvec<double,4>, func_t>();
-  bench_type_func<realvec<double,4>, func_t>();
+  bench_type_func<realvec<double, 4>, func_t>();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8
-  bench_type_func<realpseudovec<double,8>, func_t>();
+  bench_type_func<realpseudovec<double, 8>, func_t>();
 #ifdef __clang__
-  bench_type_func<realbuiltinvec<double,8>, func_t>();
+  bench_type_func<realbuiltinvec<double, 8>, func_t>();
 #endif
   // bench_type_func<realtestvec<double,8>, func_t>();
-  bench_type_func<realvec<double,8>, func_t>();
+  bench_type_func<realvec<double, 8>, func_t>();
 #endif
 }
 
-
-
-void bench()
-{
+void bench() {
   bench_func<functor_nop>();
-  
+
   bench_func<functor_fneg>();
   bench_func<functor_fadd>();
   bench_func<functor_fsub>();
   bench_func<functor_fmul>();
   bench_func<functor_fdiv>();
-  
+
   bench_func<functor_acos>();
   bench_func<functor_acosh>();
   bench_func<functor_asin>();
@@ -436,10 +395,7 @@ void bench()
   bench_func<functor_trunc>();
 }
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   cout << "Benchmarking math functions:\n";
   bench();
   return 0;
diff --git a/example.cc b/example.cc
index c48ef67..427ec02 100644
--- a/example.cc
+++ b/example.cc
@@ -7,20 +7,18 @@
 using namespace std;
 using namespace vecmathlib;
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   // Declare a double precision vector with an architecture-dependent
   // number of elements
   float64_vec x;
   // Set each element separately. This is inefficient and should be
   // avoided if possible, but we want to demonstrate it here anyway.
-  for (int i=0; i<float64_vec::size; ++i) x.set_elt(i, double(i));
+  for (int i = 0; i < float64_vec::size; ++i)
+    x.set_elt(i, double(i));
   float64_vec y = x + float64_vec(1.0);
   y = sqrt(y);
   float64_vec z = log(y);
-  
+
   // Boolean vectors are closely related to either double or float
   // vectors, thus we need to make a distinction
   bool64_vec b = x < y;
@@ -29,12 +27,12 @@ int main(int argc, char** argv)
   // corresponding to "float64_vec", and there is "int_vec"
   // correpsonding to "float_vec".
   int64_vec i = convert_int(y);
-  
+
   cout << "x=" << x << "\n";
   cout << "y=" << y << "\n";
   cout << "z=" << z << "\n";
   cout << "b=" << b << "\n";
   cout << "i=" << i << "\n";
-  
+
   return 0;
 }
diff --git a/example_float.cc b/example_float.cc
index fed91c7..4feea0e 100644
--- a/example_float.cc
+++ b/example_float.cc
@@ -7,20 +7,18 @@
 using namespace std;
 using namespace vecmathlib;
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   // Declare a float precision vector with an architecture-dependent
   // number of elements
   float32_vec x;
   // Set each element separately. This is inefficient and should be
   // avoided if possible, but we want to demonstrate it here anyway.
-  for (int i=0; i<float32_vec::size; ++i) x.set_elt(i, float(i));
+  for (int i = 0; i < float32_vec::size; ++i)
+    x.set_elt(i, float(i));
   float32_vec y = x + float32_vec(1.0);
   y = sqrt(y);
   float32_vec z = log(y);
-  
+
   // Boolean vectors are closely related to either float or float
   // vectors, thus we need to make a distinction
   bool32_vec b = x < y;
@@ -29,12 +27,12 @@ int main(int argc, char** argv)
   // corresponding to "float32_vec", and there is "int_vec"
   // correpsonding to "float32_vec".
   int32_vec i = convert_int(y);
-  
+
   cout << "x=" << x << "\n";
   cout << "y=" << y << "\n";
   cout << "z=" << z << "\n";
   cout << "b=" << b << "\n";
   cout << "i=" << i << "\n";
-  
+
   return 0;
 }
diff --git a/floatbuiltins.h b/floatbuiltins.h
index ee076a2..a7dd6f1 100644
--- a/floatbuiltins.h
+++ b/floatbuiltins.h
@@ -6,323 +6,383 @@
 #if defined __clang__
 
 namespace vecmathlib {
-  
-  inline char builtin_abs(char x) { return __builtin_abs(x); }
-  inline short builtin_abs(short x) { return __builtin_abs(x); }
-  inline int builtin_abs(int x) { return __builtin_abs(x); }
-  inline long builtin_abs(long x) { return __builtin_labs(x); }
+
+inline char builtin_abs(char x) { return __builtin_abs(x); }
+inline short builtin_abs(short x) { return __builtin_abs(x); }
+inline int builtin_abs(int x) { return __builtin_abs(x); }
+inline long builtin_abs(long x) { return __builtin_labs(x); }
 #if __SIZEOF_LONG_LONG__
-  inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
+inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
 #endif
-  
-  inline unsigned char builtin_clz(unsigned char x) { return __builtin_clzs(x) - CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char)); }
-  inline unsigned short builtin_clz(unsigned short x) { return __builtin_clzs(x); }
-  inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
-  inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
+
+inline unsigned char builtin_clz(unsigned char x) {
+  return __builtin_clzs(x) -
+         CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char));
+}
+inline unsigned short builtin_clz(unsigned short x) {
+  return __builtin_clzs(x);
+}
+inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
+inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
 #if __SIZEOF_LONG_LONG__
-  inline unsigned long long builtin_clz(unsigned long long x) { return __builtin_clzll(x); }
+inline unsigned long long builtin_clz(unsigned long long x) {
+  return __builtin_clzll(x);
+}
 #endif
-  
-  inline unsigned char builtin_popcount(unsigned char x) { return __builtin_popcount(x); }
-  inline unsigned short builtin_popcount(unsigned short x) { return __builtin_popcount(x); }
-  inline unsigned int builtin_popcount(unsigned int x) { return __builtin_popcount(x); }
-  inline unsigned long builtin_popcount(unsigned long x) { return __builtin_popcountl(x); }
+
+inline unsigned char builtin_popcount(unsigned char x) {
+  return __builtin_popcount(x);
+}
+inline unsigned short builtin_popcount(unsigned short x) {
+  return __builtin_popcount(x);
+}
+inline unsigned int builtin_popcount(unsigned int x) {
+  return __builtin_popcount(x);
+}
+inline unsigned long builtin_popcount(unsigned long x) {
+  return __builtin_popcountl(x);
+}
 #if __SIZEOF_LONG_LONG__
-  inline unsigned long long builtin_popcount(unsigned long long x) { return __builtin_popcountll(x); }
+inline unsigned long long builtin_popcount(unsigned long long x) {
+  return __builtin_popcountll(x);
+}
 #endif
-  
-  
-  
-  inline float builtin_acos(float x) { return __builtin_acosf(x); }
-  inline double builtin_acos(double x) { return __builtin_acos(x); }
+
+inline float builtin_acos(float x) { return __builtin_acosf(x); }
+inline double builtin_acos(double x) { return __builtin_acos(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
+inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
 #endif
-  
-  inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
-  inline double builtin_acosh(double x) { return __builtin_acosh(x); }
+
+inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
+inline double builtin_acosh(double x) { return __builtin_acosh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
+inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
 #endif
-  
-  inline float builtin_asin(float x) { return __builtin_asinf(x); }
-  inline double builtin_asin(double x) { return __builtin_asin(x); }
+
+inline float builtin_asin(float x) { return __builtin_asinf(x); }
+inline double builtin_asin(double x) { return __builtin_asin(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
+inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
 #endif
-  
-  inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
-  inline double builtin_asinh(double x) { return __builtin_asinh(x); }
+
+inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
+inline double builtin_asinh(double x) { return __builtin_asinh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
+inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
 #endif
-  
-  inline float builtin_atan(float x) { return __builtin_atanf(x); }
-  inline double builtin_atan(double x) { return __builtin_atan(x); }
+
+inline float builtin_atan(float x) { return __builtin_atanf(x); }
+inline double builtin_atan(double x) { return __builtin_atan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
+inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
 #endif
-  
-  inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
-  inline double builtin_atan2(double x, double y) { return __builtin_atan2(x, y); }
+
+inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
+inline double builtin_atan2(double x, double y) {
+  return __builtin_atan2(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atan2(long double x, long double y) { return __builtin_atan2l(x, y); }
+inline long double builtin_atan2(long double x, long double y) {
+  return __builtin_atan2l(x, y);
+}
 #endif
-  
-  inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
-  inline double builtin_atanh(double x) { return __builtin_atanh(x); }
+
+inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
+inline double builtin_atanh(double x) { return __builtin_atanh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
+inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
 #endif
-  
-  inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
-  inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
+
+inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
+inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
+inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
 #endif
-  
-  inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
-  inline double builtin_ceil(double x) { return __builtin_ceil(x); }
+
+inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
+inline double builtin_ceil(double x) { return __builtin_ceil(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
+inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
 #endif
-    
-  inline float builtin_copysign(float x, float y) { return __builtin_copysignf(x, y); }
-  inline double builtin_copysign(double x, double y) { return __builtin_copysign(x, y); }
+
+inline float builtin_copysign(float x, float y) {
+  return __builtin_copysignf(x, y);
+}
+inline double builtin_copysign(double x, double y) {
+  return __builtin_copysign(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_copysign(long double x, long double y) { return __builtin_copysignl(x, y); }
+inline long double builtin_copysign(long double x, long double y) {
+  return __builtin_copysignl(x, y);
+}
 #endif
 
-  inline float builtin_cos(float x) { return __builtin_cosf(x); }
-  inline double builtin_cos(double x) { return __builtin_cos(x); }
+inline float builtin_cos(float x) { return __builtin_cosf(x); }
+inline double builtin_cos(double x) { return __builtin_cos(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
+inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
 #endif
-  
-  inline float builtin_cosh(float x) { return __builtin_coshf(x); }
-  inline double builtin_cosh(double x) { return __builtin_cosh(x); }
+
+inline float builtin_cosh(float x) { return __builtin_coshf(x); }
+inline double builtin_cosh(double x) { return __builtin_cosh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
+inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
 #endif
 
-  inline float builtin_exp(float x) { return __builtin_expf(x); }
-  inline double builtin_exp(double x) { return __builtin_exp(x); }
+inline float builtin_exp(float x) { return __builtin_expf(x); }
+inline double builtin_exp(double x) { return __builtin_exp(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_exp(long double x) { return __builtin_expl(x); }
+inline long double builtin_exp(long double x) { return __builtin_expl(x); }
 #endif
-  
-  inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
-  inline double builtin_exp2(double x) { return __builtin_exp2(x); }
+
+inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
+inline double builtin_exp2(double x) { return __builtin_exp2(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
+inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
 #endif
 
-  inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
-  inline double builtin_expm1(double x) { return __builtin_expm1(x); }
+inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
+inline double builtin_expm1(double x) { return __builtin_expm1(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
+inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
 #endif
 
-  inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
-  inline double builtin_fabs(double x) { return __builtin_fabs(x); }
+inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
+inline double builtin_fabs(double x) { return __builtin_fabs(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
+inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
 #endif
-  
-  inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
-  inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
+
+inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
+inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fdim(long double x, long double y) { return __builtin_fdiml(x, y); }
+inline long double builtin_fdim(long double x, long double y) {
+  return __builtin_fdiml(x, y);
+}
 #endif
-  
-  inline float builtin_floor(float x) { return __builtin_floorf(x); }
-  inline double builtin_floor(double x) { return __builtin_floor(x); }
+
+inline float builtin_floor(float x) { return __builtin_floorf(x); }
+inline double builtin_floor(double x) { return __builtin_floor(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
+inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
 #endif
-  
-  inline float builtin_fma(float x, float y, float z) { return __builtin_fmaf(x, y, z); }
-  inline double builtin_fma(double x, double y, double z) { return __builtin_fma(x, y, z); }
+
+inline float builtin_fma(float x, float y, float z) {
+  return __builtin_fmaf(x, y, z);
+}
+inline double builtin_fma(double x, double y, double z) {
+  return __builtin_fma(x, y, z);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fma(long double x, long double y, long double z) { return __builtin_fmal(x, y, z); }
+inline long double builtin_fma(long double x, long double y, long double z) {
+  return __builtin_fmal(x, y, z);
+}
 #endif
-  
-  inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
-  inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
+
+inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
+inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmax(long double x, long double y) { return __builtin_fmaxl(x, y); }
+inline long double builtin_fmax(long double x, long double y) {
+  return __builtin_fmaxl(x, y);
+}
 #endif
-  
-  inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
-  inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
+
+inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
+inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmin(long double x, long double y) { return __builtin_fminl(x, y); }
+inline long double builtin_fmin(long double x, long double y) {
+  return __builtin_fminl(x, y);
+}
 #endif
-  
-  inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
-  inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
+
+inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
+inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_fmod(long double x, long double y) { return __builtin_fmodl(x, y); }
+inline long double builtin_fmod(long double x, long double y) {
+  return __builtin_fmodl(x, y);
+}
 #endif
-  
-  inline float builtin_frexp(float x, int* r) { return __builtin_frexpf(x, r); }
-  inline double builtin_frexp(double x, int* r) { return __builtin_frexp(x, r); }
+
+inline float builtin_frexp(float x, int *r) { return __builtin_frexpf(x, r); }
+inline double builtin_frexp(double x, int *r) { return __builtin_frexp(x, r); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_frexp(long double x, int* r) { return __builtin_frexpl(x, r); }
+inline long double builtin_frexp(long double x, int *r) {
+  return __builtin_frexpl(x, r);
+}
 #endif
-  
-  inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
-  inline double builtin_hypot(double x, double y) { return __builtin_hypot(x, y); }
+
+inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
+inline double builtin_hypot(double x, double y) {
+  return __builtin_hypot(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_hypot(long double x, long double y) { return __builtin_hypotl(x, y); }
+inline long double builtin_hypot(long double x, long double y) {
+  return __builtin_hypotl(x, y);
+}
 #endif
-  
-  inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
-  inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
+
+inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
+inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
+inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
 #endif
-  
-  inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
-  inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
+
+inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
 #endif
-  
-  inline int builtin_isinf(float x) { return __builtin_isinf(x); }
-  inline int builtin_isinf(double x) { return __builtin_isinf(x); }
+
+inline int builtin_isinf(float x) { return __builtin_isinf(x); }
+inline int builtin_isinf(double x) { return __builtin_isinf(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
+inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
 #endif
-  
-  inline int builtin_isnan(float x) { return __builtin_isnan(x); }
-  inline int builtin_isnan(double x) { return __builtin_isnan(x); }
+
+inline int builtin_isnan(float x) { return __builtin_isnan(x); }
+inline int builtin_isnan(double x) { return __builtin_isnan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
+inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
 #endif
-  
-  inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
-  inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
+
+inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
 #endif
-  
-  inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
-  inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
+
+inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
+inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_ldexp(long double x, int y) { return __builtin_ldexpl(x, y); }
+inline long double builtin_ldexp(long double x, int y) {
+  return __builtin_ldexpl(x, y);
+}
 #endif
-  
-  inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
-  inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
+
+inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
+inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
+inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
 #endif
 
-  inline float builtin_log(float x) { return __builtin_logf(x); }
-  inline double builtin_log(double x) { return __builtin_log(x); }
+inline float builtin_log(float x) { return __builtin_logf(x); }
+inline double builtin_log(double x) { return __builtin_log(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log(long double x) { return __builtin_logl(x); }
+inline long double builtin_log(long double x) { return __builtin_logl(x); }
 #endif
 
-  inline float builtin_log10(float x) { return __builtin_log10f(x); }
-  inline double builtin_log10(double x) { return __builtin_log10(x); }
+inline float builtin_log10(float x) { return __builtin_log10f(x); }
+inline double builtin_log10(double x) { return __builtin_log10(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
+inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
 #endif
 
-  inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
-  inline double builtin_log1p(double x) { return __builtin_log1p(x); }
+inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
+inline double builtin_log1p(double x) { return __builtin_log1p(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
+inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
 #endif
 
-  inline float builtin_log2(float x) { return __builtin_log2f(x); }
-  inline double builtin_log2(double x) { return __builtin_log2(x); }
+inline float builtin_log2(float x) { return __builtin_log2f(x); }
+inline double builtin_log2(double x) { return __builtin_log2(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
+inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
 #endif
-  
-  inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
-  inline long builtin_lrint(double x) { return __builtin_lrint(x); }
+
+inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
+inline long builtin_lrint(double x) { return __builtin_lrint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
+inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
 #endif
-  
-  inline float builtin_nextafter(float x, float y) { return __builtin_nextafterf(x, y); }
-  inline double builtin_nextafter(double x, double y) { return __builtin_nextafter(x, y); }
+
+inline float builtin_nextafter(float x, float y) {
+  return __builtin_nextafterf(x, y);
+}
+inline double builtin_nextafter(double x, double y) {
+  return __builtin_nextafter(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_nextafter(long double x, long double y) { return __builtin_nextafterl(x, y); }
+inline long double builtin_nextafter(long double x, long double y) {
+  return __builtin_nextafterl(x, y);
+}
 #endif
-  
-  inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
-  inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
+
+inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
+inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_pow(long double x, long double y) { return __builtin_powl(x, y); }
+inline long double builtin_pow(long double x, long double y) {
+  return __builtin_powl(x, y);
+}
 #endif
-  
-  inline float builtin_remainder(float x, float y) { return __builtin_remainderf(x, y); }
-  inline double builtin_remainder(double x, double y) { return __builtin_remainder(x, y); }
+
+inline float builtin_remainder(float x, float y) {
+  return __builtin_remainderf(x, y);
+}
+inline double builtin_remainder(double x, double y) {
+  return __builtin_remainder(x, y);
+}
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_remainder(long double x, long double y) { return __builtin_remainderl(x, y); }
+inline long double builtin_remainder(long double x, long double y) {
+  return __builtin_remainderl(x, y);
+}
 #endif
 
-  inline float builtin_rint(float x) { return __builtin_rintf(x); }
-  inline double builtin_rint(double x) { return __builtin_rint(x); }
+inline float builtin_rint(float x) { return __builtin_rintf(x); }
+inline double builtin_rint(double x) { return __builtin_rint(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
+inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
 #endif
 
-  inline float builtin_round(float x) { return __builtin_roundf(x); }
-  inline double builtin_round(double x) { return __builtin_round(x); }
+inline float builtin_round(float x) { return __builtin_roundf(x); }
+inline double builtin_round(double x) { return __builtin_round(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_round(long double x) { return __builtin_roundl(x); }
+inline long double builtin_round(long double x) { return __builtin_roundl(x); }
 #endif
-  
-  inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
-  inline int builtin_signbit(double x) { return __builtin_signbit(x); }
+
+inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
+inline int builtin_signbit(double x) { return __builtin_signbit(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
+inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
 #endif
 
-  inline float builtin_sin(float x) { return __builtin_sinf(x); }
-  inline double builtin_sin(double x) { return __builtin_sin(x); }
+inline float builtin_sin(float x) { return __builtin_sinf(x); }
+inline double builtin_sin(double x) { return __builtin_sin(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
+inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
 #endif
-  
-  inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
-  inline double builtin_sinh(double x) { return __builtin_sinh(x); }
+
+inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
+inline double builtin_sinh(double x) { return __builtin_sinh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
+inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
 #endif
-  
-  inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
-  inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
+
+inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
+inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
+inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
 #endif
 
-  inline float builtin_tan(float x) { return __builtin_tanf(x); }
-  inline double builtin_tan(double x) { return __builtin_tan(x); }
+inline float builtin_tan(float x) { return __builtin_tanf(x); }
+inline double builtin_tan(double x) { return __builtin_tan(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
+inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
 #endif
-  
-  inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
-  inline double builtin_tanh(double x) { return __builtin_tanh(x); }
+
+inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
+inline double builtin_tanh(double x) { return __builtin_tanh(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
+inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
 #endif
-  
-  inline float builtin_trunc(float x) { return __builtin_truncf(x); }
-  inline double builtin_trunc(double x) { return __builtin_trunc(x); }
+
+inline float builtin_trunc(float x) { return __builtin_truncf(x); }
+inline double builtin_trunc(double x) { return __builtin_trunc(x); }
 #if __SIZEOF_LONG_DOUBLE__
-  inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
+inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
 #endif
-  
 }
 
 #endif
 
-#endif  // #ifndef FLOATBUILTINS_H
+#endif // #ifndef FLOATBUILTINS_H
diff --git a/floatprops.h b/floatprops.h
index f1c39a2..c7a3b7f 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -10,310 +10,279 @@
 #include <cstring>
 #include <limits>
 
+namespace vecmathlib {
 
+// A structure describing various properties of a floating point
+// type. Most properties are already described in numeric_limits, so
+// we inherit it.
+template <typename real_t> struct floatprops {
+  // Some interesting properties are:
+  //    min
+  //    max
+  //    digits
+  //    epsilon
+  //    min_exponent
+  //    max_exponent
+  //    infinity
+  //    quiet_NaN
+};
 
-namespace vecmathlib {
-  
-  // A structure describing various properties of a floating point
-  // type. Most properties are already described in numeric_limits, so
-  // we inherit it.
-  template<typename real_t>
-  struct floatprops {
-    // Some interesting properties are:
-    //    min
-    //    max
-    //    digits
-    //    epsilon
-    //    min_exponent
-    //    max_exponent
-    //    infinity
-    //    quiet_NaN
-  };
-  
-  
-  
-  // Properties of fp8
-  template<>
-  struct floatprops<fp8> {
-    typedef fp8 real_t;
-    typedef vml_std::int8_t int_t;
-    typedef vml_std::uint8_t uint_t;
-    
-    static char const* name() { return "fp8"; }
-    
-    // Definitions that might come from numeric_limits<> instead:
-    static real_t min() { __builtin_unreachable(); }
-    static real_t max() { __builtin_unreachable(); }
-    static int const digits = 4;
-    static real_t epsilon() { __builtin_unreachable(); }
-    static int const min_exponent = -6;
-    static int const max_exponent = 7;
-    static real_t infinity() { __builtin_unreachable(); }
-    static real_t quiet_NaN() { __builtin_unreachable(); }
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { __builtin_unreachable(); }
-    static int_t convert_int(real_t x) { __builtin_unreachable(); }
-  };
-  
-  
-  
-  // Properties of fp16
-  template<>
-  struct floatprops<fp16> {
-    typedef fp16 real_t;
-    typedef vml_std::int16_t int_t;
-    typedef vml_std::uint16_t uint_t;
-    
-    static char const* name() { return "fp16"; }
-    
-    // Definitions that might come from numeric_limits<> instead:
-    static real_t min() { __builtin_unreachable(); }
-    static real_t max() { __builtin_unreachable(); }
-    static int const digits = 11;
-    static real_t epsilon() { __builtin_unreachable(); }
-    static int const min_exponent = -14;
-    static int const max_exponent = 15;
-    static real_t infinity() { __builtin_unreachable(); }
-    static real_t quiet_NaN() { __builtin_unreachable(); }
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { __builtin_unreachable(); }
-    static int_t convert_int(real_t x) { __builtin_unreachable(); }
-  };
-  
-  
-  
-  // Properties of float
-  template<>
-  struct floatprops<float>: std::numeric_limits<float> {
-    typedef float real_t;
-    typedef vml_std::int32_t int_t;
-    typedef vml_std::uint32_t uint_t;
-    
-    static char const* name() { return "float"; }
-    
-    // Ensure the internal representation is what we expect
-    static_assert(is_signed, "real_t is not signed");
-    static_assert(radix==2, "real_t is not binary");
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { return real_t(x); }
-    static int_t convert_int(real_t x) { return int_t(x); }
-  };
-  
-  
-  
-  // Properties of double
-  template<>
-  struct floatprops<double>: std::numeric_limits<double> {
-    typedef double real_t;
-    typedef vml_std::int64_t int_t;
-    typedef vml_std::uint64_t uint_t;
-    
-    static char const* name() { return "double"; }
-    
-    // Ensure the internal representation is what we expect
-    static_assert(is_signed, "real_t is not signed");
-    static_assert(radix==2, "real_t is not binary");
-    
-    // Ensure the sizes match
-    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
-    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-    
-    // Number of bits in internal representation
-    static int const bits = 8 * sizeof(real_t);
-    static int const mantissa_bits = digits - 1;
-    static int const signbit_bits = 1;
-    static int const exponent_bits = bits - mantissa_bits - signbit_bits;
-    static int const exponent_offset = 2 - min_exponent;
-    static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
-                  "error in bit counts");
-    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
-    static uint_t const exponent_mask =
-      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
-    static uint_t const signbit_mask = uint_t(1) << (bits-1);
-    static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
-                  "error in masks");
-    static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
-                  uint_t(~uint_t(0)),
-                  "error in masks");
-    
-    // Re-interpret bit patterns
-    static real_t as_float(int_t x)
-    {
-      real_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t as_int(real_t x)
-    {
-      int_t res;
-      std::memcpy(&res, &x, sizeof res);
-      return res;
-    }
-    static int_t replicate_byte(unsigned char byte)
-    {
-      int_t res;
-      std::memset(&res, byte, sizeof res);
-      return res;
-    }
-    
-    // Convert values (truncate)
-    static real_t convert_float(int_t x) { return real_t(x); }
-    static int_t convert_int(real_t x) { return int_t(x); }
-  };
-  
-  
-  
-  // We are adding the (unused) type RV here to avoid name mangling
-  // problems. On some systems, the vector size does not enter into
-  // the mangled name (!), leading to duplicate function definitions.
-  template<typename RV, typename V, typename E>
-  E get_elt(const V& v, const int n)
-  {
-    const size_t s = sizeof(E);
-    E e;
-    // assert(n>=0 and s*n<sizeof(V));
-    std::memcpy(&e, &((const char*)&v)[s*n], s);
-    return e;
+// Properties of fp8
+template <> struct floatprops<fp8> {
+  typedef fp8 real_t;
+  typedef vml_std::int8_t int_t;
+  typedef vml_std::uint8_t uint_t;
+
+  static char const *name() { return "fp8"; }
+
+  // Definitions that might come from numeric_limits<> instead:
+  static real_t min() { __builtin_unreachable(); }
+  static real_t max() { __builtin_unreachable(); }
+  static int const digits = 4;
+  static real_t epsilon() { __builtin_unreachable(); }
+  static int const min_exponent = -6;
+  static int const max_exponent = 7;
+  static real_t infinity() { __builtin_unreachable(); }
+  static real_t quiet_NaN() { __builtin_unreachable(); }
+
+  // Ensure the sizes match
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent;
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (truncate)
+  static real_t convert_float(int_t x) { __builtin_unreachable(); }
+  static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of fp16
+template <> struct floatprops<fp16> {
+  typedef fp16 real_t;
+  typedef vml_std::int16_t int_t;
+  typedef vml_std::uint16_t uint_t;
+
+  static char const *name() { return "fp16"; }
+
+  // Definitions that might come from numeric_limits<> instead:
+  static real_t min() { __builtin_unreachable(); }
+  static real_t max() { __builtin_unreachable(); }
+  static int const digits = 11;
+  static real_t epsilon() { __builtin_unreachable(); }
+  static int const min_exponent = -14;
+  static int const max_exponent = 15;
+  static real_t infinity() { __builtin_unreachable(); }
+  static real_t quiet_NaN() { __builtin_unreachable(); }
+
+  // Ensure the sizes match
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent;
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (truncate)
+  static real_t convert_float(int_t x) { __builtin_unreachable(); }
+  static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of float
+template <> struct floatprops<float> : std::numeric_limits<float> {
+  typedef float real_t;
+  typedef vml_std::int32_t int_t;
+  typedef vml_std::uint32_t uint_t;
+
+  static char const *name() { return "float"; }
+
+  // Ensure the internal representation is what we expect
+  static_assert(is_signed, "real_t is not signed");
+  static_assert(radix == 2, "real_t is not binary");
+
+  // Ensure the sizes match
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Number of bits in internal representation
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1;
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent;
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
   }
-  
-  template<typename RV, typename V, typename E>
-  V& set_elt(V& v, const int n, const E e)
-  {
-    const size_t s = sizeof(E);
-    // assert(n>=0 and s*n<sizeof(V));
-    std::memcpy(&((char*)&v)[s*n], &e, s);
-    return v;
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
   }
-  
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values (truncate)
+  static real_t convert_float(int_t x) { return real_t(x); }
+  static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// Properties of double: integer companion types, bit layout, bit-cast helpers
+template <> struct floatprops<double> : std::numeric_limits<double> {
+  typedef double real_t;
+  typedef vml_std::int64_t int_t;
+  typedef vml_std::uint64_t uint_t;
+
+  static char const *name() { return "double"; }
+
+  // Compile-time checks: the type must be signed and use a binary radix
+  static_assert(is_signed, "real_t is not signed");
+  static_assert(radix == 2, "real_t is not binary");
+
+  // int_t and uint_t must be exactly as wide as real_t (memcpy punning below)
+  static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+  static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+  // Bit layout from numeric_limits, high to low: sign | exponent | mantissa
+  static int const bits = 8 * sizeof(real_t);
+  static int const mantissa_bits = digits - 1; // explicitly stored bits
+  static int const signbit_bits = 1;
+  static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+  static int const exponent_offset = 2 - min_exponent; // exponent bias
+  static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+                "error in bit counts");
+  static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+  static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+                                      << mantissa_bits;
+  static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+  static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+                "error in masks");
+  static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+                    uint_t(~uint_t(0)),
+                "error in masks");
+
+  // Re-interpret bit patterns: bytewise copies, no numeric conversion
+  static real_t as_float(int_t x) {
+    real_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t as_int(real_t x) {
+    int_t res;
+    std::memcpy(&res, &x, sizeof res);
+    return res;
+  }
+  static int_t replicate_byte(unsigned char byte) {
+    int_t res;
+    std::memset(&res, byte, sizeof res);
+    return res;
+  }
+
+  // Convert values numerically with casts (real -> int truncates toward 0)
+  static real_t convert_float(int_t x) { return real_t(x); }
+  static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// We are adding the (unused) type RV here to avoid name mangling
+// problems. On some systems, the vector size does not enter into
+// the mangled name (!), leading to duplicate function definitions.
+template <typename RV, typename V, typename E>
+E get_elt(const V &v, const int n) { // read element n of v as an E
+  const size_t s = sizeof(E);
+  E e;
+  // Bounds are not checked: caller ensures n >= 0 and s * n < sizeof(V)
+  std::memcpy(&e, &((const char *)&v)[s * n], s); // bytes [s*n, s*n + s)
+  return e;
+}
+
+template <typename RV, typename V, typename E>
+V &set_elt(V &v, const int n, const E e) { // overwrite element n of v with e
+  const size_t s = sizeof(E);
+  // Bounds are not checked: caller ensures n >= 0 and s * n < sizeof(V)
+  std::memcpy(&((char *)&v)[s * n], &e, s); // bytes [s*n, s*n + s) of v
+  return v; // returns the modified v by reference
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef FLOATPROPS_H
+#endif // #ifndef FLOATPROPS_H
diff --git a/floattypes.h b/floattypes.h
index 5107af6..e037b95 100644
--- a/floattypes.h
+++ b/floattypes.h
@@ -3,20 +3,14 @@
 #ifndef FLOATTYPES_H
 #define FLOATTYPES_H
 
-
-
 #include <cassert>
 #include <cstdlib>
 
-
-
-#if ! (defined __clang__ || defined __gcc__)
-#  define __builtin_unreachable() (assert(0))
-#  define __builtin_expect(expr, val) (expr)
+#if !(defined __clang__ || defined __GNUC__) // GCC defines __GNUC__
+#define __builtin_unreachable() (assert(0)) // fallback without the builtin
+#define __builtin_expect(expr, val) (expr) // fallback: drop the branch hint
 #endif
 
-
-
 // We expect either 199711L or 201103L
 #if __cplusplus >= 201103L
 // C++11 is supported, use it
@@ -25,11 +19,9 @@
 #include <cstdint>
 
 namespace vml_std {
-  using namespace std;
+using namespace std;
 }
 
-
-
 #else
 // C++11 is not supported, work around the missing pieces
 
@@ -40,38 +32,35 @@ namespace vml_std {
 #include <stdint.h>
 
 #ifndef static_assert
-#  define VML_CONCAT2(x, y) x##y
-#  define VML_CONCAT(x, y) VML_CONCAT2(x, y)
-#  define static_assert(cond, msg)                                      \
-  typedef int VML_CONCAT(vml_static_assert_, __LINE__)[(cond) ? 1 : -1] \
-    __attribute__((__unused__))
+#define VML_CONCAT2(x, y) x##y
+#define VML_CONCAT(x, y) VML_CONCAT2(x, y)
+#define static_assert(cond, msg) typedef int VML_CONCAT(                       \
+    vml_static_assert_, __LINE__)[(cond) ? 1 : -1] __attribute__((__unused__))
 #endif
 
-
-
 // Capture libc macros, then undefine them
 #ifndef isfinite
-#  error "isfinite is not a macro"
+#error "isfinite is not a macro"
 #endif
 #ifndef isinf
-#  error "isinf is not a macro"
+#error "isinf is not a macro"
 #endif
 #ifndef isnan
-#  error "isnan is not a macro"
+#error "isnan is not a macro"
 #endif
 #ifndef isnormal
-#  error "isnormal is not a macro"
+#error "isnormal is not a macro"
 #endif
 #ifndef signbit
-#  error "signbit is not a macro"
+#error "signbit is not a macro"
 #endif
 
 namespace {
-  template<typename T> inline int libc_isfinite(T x) { return isfinite(x); }
-  template<typename T> inline int libc_isinf(T x) { return isinf(x); }
-  template<typename T> inline int libc_isnan(T x) { return isnan(x); }
-  template<typename T> inline int libc_isnormal(T x) { return isnormal(x); }
-  template<typename T> inline int libc_signbit(T x) { return signbit(x); }
+template <typename T> inline int libc_isfinite(T x) { return isfinite(x); }
+template <typename T> inline int libc_isinf(T x) { return isinf(x); }
+template <typename T> inline int libc_isnan(T x) { return isnan(x); }
+template <typename T> inline int libc_isnormal(T x) { return isnormal(x); }
+template <typename T> inline int libc_signbit(T x) { return signbit(x); }
 }
 
 // Include this before undefining the macros below
@@ -83,153 +72,146 @@ namespace {
 #undef isnormal
 #undef signbit
 
-
-
 namespace vml_std {
-  
-  // Make some type definitions from stdint.h available in std
-  typedef ::uint8_t uint8_t;
-  typedef ::int8_t int8_t;
-  typedef ::uint16_t uint16_t;
-  typedef ::int16_t int16_t;
-  typedef ::uint32_t uint32_t;
-  typedef ::int32_t int32_t;
+
+// Make some type definitions from stdint.h available in std
+typedef ::uint8_t uint8_t;
+typedef ::int8_t int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t int32_t;
 #if __SIZEOF_LONG__ == 8
-  // Even if both "long" and "long long" have the same size, they are
-  // still different types. In many cases, it is then preferable to
-  // use "long" instead of "long long".
-  typedef unsigned long uint64_t;
-  typedef long int64_t;
+// Even if both "long" and "long long" have the same size, they are
+// still different types. In many cases, it is then preferable to
+// use "long" instead of "long long".
+typedef unsigned long uint64_t;
+typedef long int64_t;
 #else
-  typedef ::uint64_t uint64_t;
-  typedef ::int64_t int64_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t int64_t;
 #endif
-  
-  
-  
-  // Make math functions from math.h available in vml_std
-  // (We could instead take some of them -- but not all -- from std.)
-  
-  inline float acos(float x) { return ::acosf(x); }
-  inline float acosh(float x) { return ::acoshf(x); }
-  inline float asin(float x) { return ::asinf(x); }
-  inline float asinh(float x) { return ::asinhf(x); }
-  inline float atan(float x) { return ::atanf(x); }
-  inline float atan2(float x, float y) { return ::atan2f(x, y); }
-  inline float atanh(float x) { return ::atanhf(x); }
-  inline float cbrt(float x) { return ::cbrtf(x); }
-  inline float ceil(float x) { return ::ceilf(x); }
-  inline float cos(float x) { return ::cosf(x); }
-  inline float cosh(float x) { return ::coshf(x); }
-  inline float copysign(float x, float y) { return ::copysignf(x, y); }
-  inline float exp(float x) { return ::expf(x); }
-  inline float exp2(float x) { return ::exp2f(x); }
-  inline float expm1(float x) { return ::expm1f(x); }
-  inline float fabs(float x) { return ::fabsf(x); }
-  inline float fdim(float x, float y) { return ::fdimf(x, y); }
-  inline float floor(float x) { return ::floorf(x); }
-  inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
-  inline float fmax(float x, float y) { return ::fmaxf(x, y); }
-  inline float fmin(float x, float y) { return ::fminf(x, y); }
-  inline float fmod(float x, float y) { return ::fmodf(x, y); }
-  inline float frexp(float x, int* r) { return ::frexpf(x, r); }
-  inline float hypot(float x, float y) { return ::hypotf(x, y); }
-  inline int ilogb(float x) { return ::ilogbf(x); }
-  inline bool isfinite(float x) { return libc_isfinite(x); }
-  inline bool isinf(float x) { return libc_isinf(x); }
-  inline bool isnan(float x) { return libc_isnan(x); }
-  inline bool isnormal(float x) { return libc_isnormal(x); }
-  inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
-  inline long long llrint(float x) { return ::llrintf(x); }
-  inline float log(float x) { return ::logf(x); }
-  inline float log10(float x) { return ::log10f(x); }
-  inline float log1p(float x) { return ::log1pf(x); }
-  inline float log2(float x) { return ::log2f(x); }
-  inline long lrint(float x) { return ::lrintf(x); }
-  inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
-  inline float pow(float x, float y) { return ::powf(x, y); }
-  inline float remainder(float x, float y) { return ::remainderf(x, y); }
-  inline float rint(float x) { return ::rintf(x); }
-  inline float round(float x) { return ::roundf(x); }
-  inline bool signbit(float x) { return libc_signbit(x); }
-  inline float sin(float x) { return ::sinf(x); }
-  inline float sinh(float x) { return ::sinhf(x); }
-  inline float sqrt(float x) { return ::sqrtf(x); }
-  inline float tan(float x) { return ::tanf(x); }
-  inline float tanh(float x) { return ::tanhf(x); }
-  inline float trunc(float x) { return ::truncf(x); }
-  
-  inline double acos(double x) { return ::acos(x); }
-  inline double acosh(double x) { return ::acosh(x); }
-  inline double asin(double x) { return ::asin(x); }
-  inline double asinh(double x) { return ::asinh(x); }
-  inline double atan(double x) { return ::atan(x); }
-  inline double atan2(double x, double y) { return ::atan2(x, y); }
-  inline double atanh(double x) { return ::atanh(x); }
-  inline double cbrt(double x) { return ::cbrt(x); }
-  inline double ceil(double x) { return ::ceil(x); }
-  inline double cos(double x) { return ::cos(x); }
-  inline double cosh(double x) { return ::cosh(x); }
-  inline double copysign(double x, double y) { return ::copysign(x, y); }
-  inline double exp(double x) { return ::exp(x); }
-  inline double exp2(double x) { return ::exp2(x); }
-  inline double expm1(double x) { return ::expm1(x); }
-  inline double fabs(double x) { return ::fabs(x); }
-  inline double fdim(double x, double y) { return ::fdim(x, y); }
-  inline double floor(double x) { return ::floor(x); }
-  inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
-  inline double fmax(double x, double y) { return ::fmax(x, y); }
-  inline double fmin(double x, double y) { return ::fmin(x, y); }
-  inline double fmod(double x, double y) { return ::fmod(x, y); }
-  inline double frexp(double x, int* r) { return ::frexp(x, r); }
-  inline double hypot(double x, double y) { return ::hypot(x, y); }
-  inline int ilogb(double x) { return ::ilogb(x); }
-  inline bool isfinite(double x) { return libc_isfinite(x); }
-  inline bool isinf(double x) { return libc_isinf(x); }
-  inline bool isnan(double x) { return libc_isnan(x); }
-  inline bool isnormal(double x) { return libc_isnormal(x); }
-  inline double ldexp(double x, int n) { return ::ldexp(x, n); }
-  inline long long llrint(double x) { return ::llrint(x); }
-  inline double log(double x) { return ::log(x); }
-  inline double log10(double x) { return ::log10(x); }
-  inline double log1p(double x) { return ::log1p(x); }
-  inline double log2(double x) { return ::log2(x); }
-  inline long lrint(double x) { return ::lrint(x); }
-  inline double nextafter(double x, double y) { return ::nextafter(x, y); }
-  inline double pow(double x, double y) { return ::pow(x, y); }
-  inline double remainder(double x, double y) { return ::remainder(x, y); }
-  inline double rint(double x) { return ::rint(x); }
-  inline double round(double x) { return ::round(x); }
-  inline bool signbit(double x) { return libc_signbit(x); }
-  inline double sin(double x) { return ::sin(x); }
-  inline double sinh(double x) { return ::sinh(x); }
-  inline double sqrt(double x) { return ::sqrt(x); }
-  inline double tan(double x) { return ::tan(x); }
-  inline double tanh(double x) { return ::tanh(x); }
-  inline double trunc(double x) { return ::trunc(x); }
-  
+
+// Make math functions from math.h available in vml_std
+// (We could instead take some of them -- but not all -- from std.)
+
+inline float acos(float x) { return ::acosf(x); }
+inline float acosh(float x) { return ::acoshf(x); }
+inline float asin(float x) { return ::asinf(x); }
+inline float asinh(float x) { return ::asinhf(x); }
+inline float atan(float x) { return ::atanf(x); }
+inline float atan2(float x, float y) { return ::atan2f(x, y); }
+inline float atanh(float x) { return ::atanhf(x); }
+inline float cbrt(float x) { return ::cbrtf(x); }
+inline float ceil(float x) { return ::ceilf(x); }
+inline float cos(float x) { return ::cosf(x); }
+inline float cosh(float x) { return ::coshf(x); }
+inline float copysign(float x, float y) { return ::copysignf(x, y); }
+inline float exp(float x) { return ::expf(x); }
+inline float exp2(float x) { return ::exp2f(x); }
+inline float expm1(float x) { return ::expm1f(x); }
+inline float fabs(float x) { return ::fabsf(x); }
+inline float fdim(float x, float y) { return ::fdimf(x, y); }
+inline float floor(float x) { return ::floorf(x); }
+inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
+inline float fmax(float x, float y) { return ::fmaxf(x, y); }
+inline float fmin(float x, float y) { return ::fminf(x, y); }
+inline float fmod(float x, float y) { return ::fmodf(x, y); }
+inline float frexp(float x, int *r) { return ::frexpf(x, r); }
+inline float hypot(float x, float y) { return ::hypotf(x, y); }
+inline int ilogb(float x) { return ::ilogbf(x); }
+inline bool isfinite(float x) { return libc_isfinite(x); }
+inline bool isinf(float x) { return libc_isinf(x); }
+inline bool isnan(float x) { return libc_isnan(x); }
+inline bool isnormal(float x) { return libc_isnormal(x); }
+inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
+inline long long llrint(float x) { return ::llrintf(x); }
+inline float log(float x) { return ::logf(x); }
+inline float log10(float x) { return ::log10f(x); }
+inline float log1p(float x) { return ::log1pf(x); }
+inline float log2(float x) { return ::log2f(x); }
+inline long lrint(float x) { return ::lrintf(x); }
+inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+inline float pow(float x, float y) { return ::powf(x, y); }
+inline float remainder(float x, float y) { return ::remainderf(x, y); }
+inline float rint(float x) { return ::rintf(x); }
+inline float round(float x) { return ::roundf(x); }
+inline bool signbit(float x) { return libc_signbit(x); }
+inline float sin(float x) { return ::sinf(x); }
+inline float sinh(float x) { return ::sinhf(x); }
+inline float sqrt(float x) { return ::sqrtf(x); }
+inline float tan(float x) { return ::tanf(x); }
+inline float tanh(float x) { return ::tanhf(x); }
+inline float trunc(float x) { return ::truncf(x); }
+
+inline double acos(double x) { return ::acos(x); }
+inline double acosh(double x) { return ::acosh(x); }
+inline double asin(double x) { return ::asin(x); }
+inline double asinh(double x) { return ::asinh(x); }
+inline double atan(double x) { return ::atan(x); }
+inline double atan2(double x, double y) { return ::atan2(x, y); }
+inline double atanh(double x) { return ::atanh(x); }
+inline double cbrt(double x) { return ::cbrt(x); }
+inline double ceil(double x) { return ::ceil(x); }
+inline double cos(double x) { return ::cos(x); }
+inline double cosh(double x) { return ::cosh(x); }
+inline double copysign(double x, double y) { return ::copysign(x, y); }
+inline double exp(double x) { return ::exp(x); }
+inline double exp2(double x) { return ::exp2(x); }
+inline double expm1(double x) { return ::expm1(x); }
+inline double fabs(double x) { return ::fabs(x); }
+inline double fdim(double x, double y) { return ::fdim(x, y); }
+inline double floor(double x) { return ::floor(x); }
+inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
+inline double fmax(double x, double y) { return ::fmax(x, y); }
+inline double fmin(double x, double y) { return ::fmin(x, y); }
+inline double fmod(double x, double y) { return ::fmod(x, y); }
+inline double frexp(double x, int *r) { return ::frexp(x, r); }
+inline double hypot(double x, double y) { return ::hypot(x, y); }
+inline int ilogb(double x) { return ::ilogb(x); }
+inline bool isfinite(double x) { return libc_isfinite(x); }
+inline bool isinf(double x) { return libc_isinf(x); }
+inline bool isnan(double x) { return libc_isnan(x); }
+inline bool isnormal(double x) { return libc_isnormal(x); }
+inline double ldexp(double x, int n) { return ::ldexp(x, n); }
+inline long long llrint(double x) { return ::llrint(x); }
+inline double log(double x) { return ::log(x); }
+inline double log10(double x) { return ::log10(x); }
+inline double log1p(double x) { return ::log1p(x); }
+inline double log2(double x) { return ::log2(x); }
+inline long lrint(double x) { return ::lrint(x); }
+inline double nextafter(double x, double y) { return ::nextafter(x, y); }
+inline double pow(double x, double y) { return ::pow(x, y); }
+inline double remainder(double x, double y) { return ::remainder(x, y); }
+inline double rint(double x) { return ::rint(x); }
+inline double round(double x) { return ::round(x); }
+inline bool signbit(double x) { return libc_signbit(x); }
+inline double sin(double x) { return ::sin(x); }
+inline double sinh(double x) { return ::sinh(x); }
+inline double sqrt(double x) { return ::sqrt(x); }
+inline double tan(double x) { return ::tan(x); }
+inline double tanh(double x) { return ::tanh(x); }
+inline double trunc(double x) { return ::trunc(x); }
 }
 
 #endif
 
+namespace vecmathlib {
+
+struct fp8 { // storage-only placeholder for an 8-bit floating-point value
+  // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?)
+  vml_std::uint8_t val;
+  fp8() {} // leaves val uninitialized
+  fp8(double x) { __builtin_unreachable(); } // conversion not implemented
+};
 
+struct fp16 { // storage-only placeholder for a 16-bit floating-point value
+  // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?)
+  vml_std::uint16_t val;
+  fp16() {} // leaves val uninitialized
+  fp16(double x) { __builtin_unreachable(); } // conversion not implemented
+};
 
-namespace vecmathlib {
-  
-  struct fp8 {
-    // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?)
-    vml_std::uint8_t val;
-    fp8() {}
-    fp8(double x) { __builtin_unreachable(); }
-  };
-  
-  struct fp16 {
-    // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?)
-    vml_std::uint16_t val;
-    fp16() {}
-    fp16(double x) { __builtin_unreachable(); }
-  };
-  
 } // namespace vecmathlib
 
-#endif  // #ifndef FLOATTYPES_H
+#endif // #ifndef FLOATTYPES_H
diff --git a/instantiations.cc b/instantiations.cc
index 9bd5351..956e1b9 100644
--- a/instantiations.cc
+++ b/instantiations.cc
@@ -7,84 +7,105 @@
 
 #include "vecmathlib.h"
 
+namespace vecmathlib {
 
+template <typename realvec_t, int n> // n: element index fixed at compile time
+typename realvec_t::real_t get_elt(realvec_t x) {
+  return x[n]; // element access through the vector's operator[]
+}
+template <typename realvec_t, int n> // n: element index fixed at compile time
+realvec_t set_elt(realvec_t x, typename realvec_t::real_t a) {
+  return x.set_elt(n, a); // x is a by-value copy; the caller's vector is kept
+}
+
+// template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x);
+// template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x,
+// realbuiltinvec<float,1> y);
+// template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x,
+// intbuiltinvec<float,1>::int_t n);
+// template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x,
+// intbuiltinvec<double,1>::int_t n);
+// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x,
+// intbuiltinvec<double,2>::int_t n);
+// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x,
+// intbuiltinvec<double,2> n);
+// template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c,
+// realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
+// template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t
+// c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y);
+// template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c,
+// realbuiltinvec<float,4> x, realbuiltinvec<float,4> y);
+// template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t
+// c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y);
 
-namespace vecmathlib {
-  
-  template<typename realvec_t, int n>
-  typename realvec_t::real_t get_elt(realvec_t x)
-  {
-    return x[n];
-  }
-  template<typename realvec_t, int n>
-  realvec_t set_elt(realvec_t x, typename realvec_t::real_t a)
-  {
-    return x.set_elt(n, a);
-  }
-  
-  // template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x);
-  // template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
-  // template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x, intbuiltinvec<float,1>::int_t n);
-  // template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x, intbuiltinvec<double,1>::int_t n);
-  // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2>::int_t n);
-  // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2> n);
-  // template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c, realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
-  // template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y);
-  // template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c, realbuiltinvec<float,4> x, realbuiltinvec<float,4> y);
-  // template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y);
-  
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
-  template realvec<float,1> round(realvec<float,1> x);
+template realvec<float, 1> round(realvec<float, 1> x);
 #endif
-  
+
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
-  template intvec<float,8> popcount(intvec<float,8>);
+template intvec<float, 8> popcount(intvec<float, 8>);
 #endif
-  
+
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
-  template realvec<double,1> exp(realvec<double,1> x);
-  template realvec<double,1> log(realvec<double,1> x);
-  template realvec<double,1> sin(realvec<double,1> x);
-  template realvec<double,1> sqrt(realvec<double,1> x);
-  template realvec<double,1>::real_t get_elt<realvec<double,1>,0>(realvec<double,1> x);
-  template realvec<double,1> set_elt<realvec<double,1>,0>(realvec<double,1> x, realvec<double,1>::real_t a);
+template realvec<double, 1> exp(realvec<double, 1> x);
+template realvec<double, 1> log(realvec<double, 1> x);
+template realvec<double, 1> sin(realvec<double, 1> x);
+template realvec<double, 1> sqrt(realvec<double, 1> x);
+template realvec<double, 1>::real_t
+get_elt<realvec<double, 1>, 0>(realvec<double, 1> x);
+template realvec<double, 1>
+set_elt<realvec<double, 1>, 0>(realvec<double, 1> x,
+                               realvec<double, 1>::real_t a);
 #endif
-  
+
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template realvec<double,2> exp(realvec<double,2> x);
-  template realvec<double,2> log(realvec<double,2> x);
-  template realvec<double,2> sin(realvec<double,2> x);
-  template realvec<double,2> sqrt(realvec<double,2> x);
-  template realvec<double,2>::real_t get_elt<realvec<double,2>,0>(realvec<double,2>);
-  template realvec<double,2>::real_t get_elt<realvec<double,2>,1>(realvec<double,2>);
-  template realvec<double,2> set_elt<realvec<double,2>,0>(realvec<double,2> x, realvec<double,2>::real_t a);
-  template realvec<double,2> set_elt<realvec<double,2>,1>(realvec<double,2> x, realvec<double,2>::real_t a);
+template realvec<double, 2> exp(realvec<double, 2> x);
+template realvec<double, 2> log(realvec<double, 2> x);
+template realvec<double, 2> sin(realvec<double, 2> x);
+template realvec<double, 2> sqrt(realvec<double, 2> x);
+template realvec<double, 2>::real_t
+get_elt<realvec<double, 2>, 0>(realvec<double, 2>);
+template realvec<double, 2>::real_t
+get_elt<realvec<double, 2>, 1>(realvec<double, 2>);
+template realvec<double, 2>
+set_elt<realvec<double, 2>, 0>(realvec<double, 2> x,
+                               realvec<double, 2>::real_t a);
+template realvec<double, 2>
+set_elt<realvec<double, 2>, 1>(realvec<double, 2> x,
+                               realvec<double, 2>::real_t a);
 #endif
-  
+
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template realvec<double,4> exp(realvec<double,4> x);
-  template realvec<double,4> log(realvec<double,4> x);
-  template realvec<double,4> sin(realvec<double,4> x);
-  template realvec<double,4> sqrt(realvec<double,4> x);
-  template realvec<double,4>::real_t get_elt<realvec<double,4>,0>(realvec<double,4>);
-  template realvec<double,4>::real_t get_elt<realvec<double,4>,1>(realvec<double,4>);
-  template realvec<double,4>::real_t get_elt<realvec<double,4>,2>(realvec<double,4>);
-  template realvec<double,4>::real_t get_elt<realvec<double,4>,3>(realvec<double,4>);
-  template realvec<double,4> set_elt<realvec<double,4>,0>(realvec<double,4> x, realvec<double,4>::real_t a);
-  template realvec<double,4> set_elt<realvec<double,4>,1>(realvec<double,4> x, realvec<double,4>::real_t a);
-  template realvec<double,4> set_elt<realvec<double,4>,2>(realvec<double,4> x, realvec<double,4>::real_t a);
-  template realvec<double,4> set_elt<realvec<double,4>,3>(realvec<double,4> x, realvec<double,4>::real_t a);
-  template intvec<double,4> popcount(intvec<double,4>);
+template realvec<double, 4> exp(realvec<double, 4> x);
+template realvec<double, 4> log(realvec<double, 4> x);
+template realvec<double, 4> sin(realvec<double, 4> x);
+template realvec<double, 4> sqrt(realvec<double, 4> x);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 0>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 1>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 2>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 3>(realvec<double, 4>);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 0>(realvec<double, 4> x,
+                               realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 1>(realvec<double, 4> x,
+                               realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 2>(realvec<double, 4> x,
+                               realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 3>(realvec<double, 4> x,
+                               realvec<double, 4>::real_t a);
+template intvec<double, 4> popcount(intvec<double, 4>);
 #endif
-  
 }
 
-
-
 // Various tests to detect auto-vectorization features
 
-
-
 #include <cassert>
 #include <cstdlib>
 using namespace std;
@@ -92,32 +113,25 @@ using namespace std;
 using namespace vecmathlib;
 
 #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-typedef realvec<double,4> realV;
+typedef realvec<double, 4> realV;
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-typedef realvec<double,2> realV;
+typedef realvec<double, 2> realV;
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8
-typedef realvec<float,8> realV;
+typedef realvec<float, 8> realV;
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4
-typedef realvec<float,4> realV;
+typedef realvec<float, 4> realV;
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2
-typedef realvec<float,2> realV;
+typedef realvec<float, 2> realV;
 #else
-#  error "There are no vector types"
+#error "There are no vector types"
 #endif
 
 typedef realV::scalar_t real;
 const int vecsize = realV::size;
 
-
-
 // Simple, naive loop adding two arrays
-extern "C"
-void loop_add(real* a,
-              real* b,
-              real* c,
-              ptrdiff_t n)
-{
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add(real *a, real *b, real *c, ptrdiff_t n) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpb = realV::loadu(&b[i]);
     realV tmpc = realV::loadu(&c[i]);
     realV tmpa = tmpb + tmpc;
@@ -125,16 +139,10 @@ void loop_add(real* a,
   }
 }
 
-
-
 // Declare pointers as restrict
-extern "C"
-void loop_add_restrict(real *restrict a,
-                       real *restrict b,
-                       real *restrict c,
-                       ptrdiff_t n)
-{
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add_restrict(real *restrict a, real *restrict b,
+                                  real *restrict c, ptrdiff_t n) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpb = realV::loadu(&b[i]);
     realV tmpc = realV::loadu(&c[i]);
     realV tmpa = tmpb + tmpc;
@@ -142,16 +150,10 @@ void loop_add_restrict(real *restrict a,
   }
 }
 
-
-
 // Declare pointers as restrict and aligned
-extern "C"
-void loop_add_aligned(real *restrict a,
-                      real *restrict b,
-                      real *restrict c,
-                      ptrdiff_t n)
-{
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add_aligned(real *restrict a, real *restrict b,
+                                 real *restrict c, ptrdiff_t n) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpb = realV::loada(&b[i]);
     realV tmpc = realV::loada(&c[i]);
     realV tmpa = tmpb + tmpc;
@@ -159,16 +161,11 @@ void loop_add_aligned(real *restrict a,
   }
 }
 
-
-
 // Reduction loop
-extern "C"
-real loop_dot_reduce(real *restrict a,
-                     real *restrict b,
-                     ptrdiff_t n)
-{
+extern "C" real loop_dot_reduce(real *restrict a, real *restrict b,
+                                ptrdiff_t n) {
   realV sumV = 0.0;
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpa = realV::loada(&a[i]);
     realV tmpb = realV::loada(&b[i]);
     sumV += tmpa * tmpb;
@@ -176,16 +173,10 @@ real loop_dot_reduce(real *restrict a,
   return sum(sumV);
 }
 
-
-
 // Loop with a simple if condition (fmax)
-extern "C"
-void loop_if_simple(real *restrict a,
-                    real *restrict b,
-                    real *restrict c,
-                    ptrdiff_t n)
-{
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_if_simple(real *restrict a, real *restrict b,
+                               real *restrict c, ptrdiff_t n) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpb = realV::loada(&b[i]);
     realV tmpc = realV::loada(&c[i]);
     realV tmpa = ifthen(tmpb > tmpc, tmpb, tmpc);
@@ -193,16 +184,10 @@ void loop_if_simple(real *restrict a,
   }
 }
 
-
-
 // Loop with a complex if condition (select)
-extern "C"
-void loop_if(real *restrict a,
-             real *restrict b,
-             real *restrict c,
-             ptrdiff_t n)
-{
-  for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_if(real *restrict a, real *restrict b, real *restrict c,
+                        ptrdiff_t n) {
+  for (ptrdiff_t i = 0; i < n; i += vecsize) {
     realV tmpb = realV::loada(&b[i]);
     realV tmpc = realV::loada(&c[i]);
     realV tmpa = ifthen(tmpb > realV(0.0), tmpb * tmpc, realV(1.0));
@@ -210,16 +195,10 @@ void loop_if(real *restrict a,
   }
 }
 
-
-
 // Skip ghost points
-extern "C"
-void loop_add_masked(real *restrict a,
-                     real *restrict b,
-                     real *restrict c,
-                     ptrdiff_t n)
-{
-  for (realV::mask_t mask(1, n-1, 0); mask; ++mask) {
+extern "C" void loop_add_masked(real *restrict a, real *restrict b,
+                                real *restrict c, ptrdiff_t n) {
+  for (realV::mask_t mask(1, n - 1, 0); mask; ++mask) {
     ptrdiff_t i = mask.index();
     realV tmpb = realV::loada(&b[i]);
     realV tmpc = realV::loada(&c[i]);
diff --git a/interp.cc b/interp.cc
index 12bac0e..95e2cfa 100644
--- a/interp.cc
+++ b/interp.cc
@@ -13,12 +13,8 @@ typedef realvec_t::real_t real_t;
 typedef realvec_t::intvec_t intvec_t;
 typedef intvec_t::int_t int_t;
 
-
-
-realvec_t interp(const real_t* array, ptrdiff_t size,
-                 real_t xmin, real_t xmax,
-                 realvec_t x)
-{
+realvec_t interp(const real_t *array, ptrdiff_t size, real_t xmin, real_t xmax,
+                 realvec_t x) {
   assert(size >= 2);
   // spacing
   real_t dx = (xmax - xmin) / (size - 1);
@@ -29,11 +25,11 @@ realvec_t interp(const real_t* array, ptrdiff_t size,
   intvec_t n = convert_int(cell);
   // gather values from array
   realvec_t x0, x1;
-  for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+  for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
     // ensure location is not out of bounds
-    ptrdiff_t j = max(ptrdiff_t(0), min(size-2, ptrdiff_t(n[i])));
+    ptrdiff_t j = max(ptrdiff_t(0), min(size - 2, ptrdiff_t(n[i])));
     x0.set_elt(i, array[j]);
-    x1.set_elt(i, array[j+1]);
+    x1.set_elt(i, array[j + 1]);
   }
   // determine interpolation weights
   realvec_t offset = scaled - cell;
@@ -44,20 +40,18 @@ realvec_t interp(const real_t* array, ptrdiff_t size,
   return y;
 }
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   ptrdiff_t size = 1001;
   vector<real_t> array(size);
-  for (ptrdiff_t i=0; i<size; ++i) array[i] = real_t(i) / 1000.0;
-  
+  for (ptrdiff_t i = 0; i < size; ++i)
+    array[i] = real_t(i) / 1000.0;
+
   real_t xmin = 0.0;
   real_t xmax = 0.5;
   realvec_t x = 0.333;
   cout << "x=" << x << "\n";
   realvec_t y = interp(&array[0], size, xmin, xmax, x);
   cout << "y=" << y << "\n";
-  
+
   return 0;
 }
diff --git a/loop.cc b/loop.cc
index ca8ebb8..8b42970 100644
--- a/loop.cc
+++ b/loop.cc
@@ -14,68 +14,57 @@
 using namespace std;
 using namespace vecmathlib;
 
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // Helpers
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifndef __has_builtin
-#  define __has_builtin(x) 0 // Compatibility with non-clang compilers
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers
 #endif
 
 // align upwards
-static size_t align_up(size_t i, size_t size)
-{
+static size_t align_up(size_t i, size_t size) {
   return (i + size - 1) / size * size;
 }
 
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // High-resolution timer
 ////////////////////////////////////////////////////////////////////////////////
 
 typedef unsigned long long ticks;
-inline ticks getticks()
-{
+inline ticks getticks() {
 #if __has_builtin(__builtin_readcyclecounter)
   return __builtin_readcyclecounter();
 #elif defined __x86_64__
   ticks a, d;
-  asm volatile("rdtsc" : "=a" (a), "=d" (d));
+  asm volatile("rdtsc" : "=a"(a), "=d"(d));
   return a | (d << 32);
 #elif defined __powerpc__
   unsigned int tbl, tbu, tbu1;
   do {
-    asm volatile("mftbu %0": "=r"(tbu));
-    asm volatile("mftb %0": "=r"(tbl));
-    asm volatile("mftbu %0": "=r"(tbu1));
+    asm volatile("mftbu %0" : "=r"(tbu));
+    asm volatile("mftb %0" : "=r"(tbl));
+    asm volatile("mftbu %0" : "=r"(tbu1));
   } while (tbu != tbu1);
   return ((unsigned long long)tbu << 32) | tbl;
 #else
   timeval tv;
   gettimeofday(&tv, NULL);
   return 1000000ULL * tv.tv_sec + tv.tv_usec;
-  // timespec ts;
-  // clock_gettime(CLOCK_REALTIME, &ts);
-  // return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+// timespec ts;
+// clock_gettime(CLOCK_REALTIME, &ts);
+// return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
 #endif
 }
-inline double elapsed(ticks t1, ticks t0)
-{
-  return t1-t0;
-}
+inline double elapsed(ticks t1, ticks t0) { return t1 - t0; }
 
-double get_sys_time()
-{
+double get_sys_time() {
   timeval tp;
   gettimeofday(&tp, NULL);
   return tp.tv_sec + 1.0e-6 * tp.tv_usec;
 }
 
-double measure_tick()
-{
+double measure_tick() {
   ticks const rstart = getticks();
   double const wstart = get_sys_time();
   while (get_sys_time() - wstart < 0.1) {
@@ -83,236 +72,219 @@ double measure_tick()
   }
   ticks const rend = getticks();
   double const wend = get_sys_time();
-  assert(wend-wstart >= 0.09);
+  assert(wend - wstart >= 0.09);
   return (wend - wstart) / elapsed(rend, rstart);
 }
 
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // Initialize the grid
 ////////////////////////////////////////////////////////////////////////////////
 
-template<typename realvec_t>
-void init(typename realvec_t::real_t *restrict xptr,
-          ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
-  for (ptrdiff_t j=0; j<n; ++j) {
-    for (ptrdiff_t i=0; i<m; ++i) {
-      const ptrdiff_t ij = ldm*j + i;
-      xptr[ij] = (i+j)%2;
+template <typename realvec_t>
+void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm,
+          ptrdiff_t n) {
+  for (ptrdiff_t j = 0; j < n; ++j) {
+    for (ptrdiff_t i = 0; i < m; ++i) {
+      const ptrdiff_t ij = ldm * j + i;
+      xptr[ij] = (i + j) % 2;
     }
   }
 }
 
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // Evolution loop: Simple stencil example (Gaussian smoothing)
 ////////////////////////////////////////////////////////////////////////////////
 
 // Introduce a delay, so that cache access is not so important
-template<typename T>
-static T delay(const T x)
-{
+template <typename T> static T delay(const T x) {
   return x;
   // return log(exp(x));
 }
 
 // Original version, unvectorized
-template<typename realvec_t>
+template <typename realvec_t>
 void smooth_scalar(typename realvec_t::real_t const *restrict xptr,
-                   typename realvec_t::real_t *restrict yptr,
-                   ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+                   typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                   ptrdiff_t ldm, ptrdiff_t n) {
   typedef typename realvec_t::real_t real_t;
-  for (ptrdiff_t j=1; j<n-1; ++j) {
-    for (ptrdiff_t i=1; i<m-1; ++i) {
-      const ptrdiff_t ij = ldm*j + i;
-      const real_t x   = xptr[ij];
-      const real_t xil = xptr[ij-1];
-      const real_t xir = xptr[ij+1];
-      const real_t xjl = xptr[ij-ldm];
-      const real_t xjr = xptr[ij+ldm];
+  for (ptrdiff_t j = 1; j < n - 1; ++j) {
+    for (ptrdiff_t i = 1; i < m - 1; ++i) {
+      const ptrdiff_t ij = ldm * j + i;
+      const real_t x = xptr[ij];
+      const real_t xil = xptr[ij - 1];
+      const real_t xir = xptr[ij + 1];
+      const real_t xjl = xptr[ij - ldm];
+      const real_t xjr = xptr[ij + ldm];
       const real_t y =
-        real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
+          real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
       yptr[ij] = delay(y);
     }
   }
 }
 
-
-
 // Assuming no particular alignment
-template<typename realvec_t>
+template <typename realvec_t>
 void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
-                      typename realvec_t::real_t *restrict yptr,
-                      ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+                      typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                      ptrdiff_t ldm, ptrdiff_t n) {
   typedef typename realvec_t::real_t real_t;
   typedef typename realvec_t::mask_t mask_t;
-  for (ptrdiff_t j=1; j<n-1; ++j) {
+  for (ptrdiff_t j = 1; j < n - 1; ++j) {
     // Desired loop bounds
     const ptrdiff_t imin = 1;
-    const ptrdiff_t imax = m-1;
+    const ptrdiff_t imax = m - 1;
     // Align actual loop iterations with vector size
-    const ptrdiff_t ioff = ldm*j;
+    const ptrdiff_t ioff = ldm * j;
     for (mask_t mask(imin, imax, ioff); mask; ++mask) {
       const ptrdiff_t i = mask.index();
       const ptrdiff_t ij = ioff + i;
-      const realvec_t x   = realvec_t::loadu(xptr+ij);
-      const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
-      const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
-      const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
-      const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm);
-      const realvec_t y =
-        realvec_t(real_t(0.5)) * x +
-        realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
-      storeu(delay(y), yptr+ij, mask);
+      const realvec_t x = realvec_t::loadu(xptr + ij);
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+      const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm);
+      const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm);
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storeu(delay(y), yptr + ij, mask);
     }
   }
 }
 
-
-
 // Assuming that xptr and yptr are aligned, but ldm can be arbitrary
-template<typename realvec_t>
+template <typename realvec_t>
 void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
-                    typename realvec_t::real_t *restrict yptr,
-                    ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+                    typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                    ptrdiff_t ldm, ptrdiff_t n) {
   typedef typename realvec_t::real_t real_t;
   typedef typename realvec_t::mask_t mask_t;
-  for (ptrdiff_t j=1; j<n-1; ++j) {
+  for (ptrdiff_t j = 1; j < n - 1; ++j) {
     // Desired loop bounds
     const ptrdiff_t imin = 1;
-    const ptrdiff_t imax = m-1;
+    const ptrdiff_t imax = m - 1;
     // Align actual loop iterations with vector size
-    const ptrdiff_t ioff = ldm*j;
+    const ptrdiff_t ioff = ldm * j;
     for (mask_t mask(imin, imax, ioff); mask; ++mask) {
       const ptrdiff_t i = mask.index();
       const ptrdiff_t ij = ioff + i;
-      const realvec_t x   = realvec_t::loada(xptr+ij);
-      const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
-      const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
-      const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
-      const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm);
-      const realvec_t y =
-        realvec_t(real_t(0.5)) * x +
-        realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
-      storea(delay(y), yptr+ij, mask);
+      const realvec_t x = realvec_t::loada(xptr + ij);
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+      const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm);
+      const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm);
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storea(delay(y), yptr + ij, mask);
     }
   }
 }
 
-
-
 // Assuming that xptr and yptr are aligned, and ldm is a multiple of
 // the vector size
-template<typename realvec_t>
+template <typename realvec_t>
 void smooth_padded(typename realvec_t::real_t const *restrict xptr,
-                   typename realvec_t::real_t *restrict yptr,
-                   ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+                   typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+                   ptrdiff_t ldm, ptrdiff_t n) {
   typedef typename realvec_t::real_t real_t;
   typedef typename realvec_t::mask_t mask_t;
   assert(ldm % realvec_t::size == 0);
-  for (ptrdiff_t j=1; j<n-1; ++j) {
+  for (ptrdiff_t j = 1; j < n - 1; ++j) {
     // Desired loop bounds
     const ptrdiff_t imin = 1;
-    const ptrdiff_t imax = m-1;
+    const ptrdiff_t imax = m - 1;
     // Align actual loop iterations with vector size
-    const ptrdiff_t ioff = ldm*j;
+    const ptrdiff_t ioff = ldm * j;
     for (mask_t mask(imin, imax, ioff); mask; ++mask) {
       const ptrdiff_t i = mask.index();
       const ptrdiff_t ij = ioff + i;
-      const realvec_t x   = realvec_t::loada(xptr+ij);
-      const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
-      const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
-      const realvec_t xjl = realvec_t::loada(xptr+ij-ldm);
-      const realvec_t xjr = realvec_t::loada(xptr+ij+ldm);
-      const realvec_t y =
-        realvec_t(real_t(0.5)) * x +
-        realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
-      storea(delay(y), yptr+ij, mask);
+      const realvec_t x = realvec_t::loada(xptr + ij);
+      const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+      const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+      const realvec_t xjl = realvec_t::loada(xptr + ij - ldm);
+      const realvec_t xjr = realvec_t::loada(xptr + ij + ldm);
+      const realvec_t y = realvec_t(real_t(0.5)) * x +
+                          realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+      storea(delay(y), yptr + ij, mask);
     }
   }
 }
 
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // Main routine
 ////////////////////////////////////////////////////////////////////////////////
 
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   // Number of iterations
   const int niters = 100;
-  
+
   // Grid size
   const ptrdiff_t m = 100;
   const ptrdiff_t n = 100;
-  
-  // Choose a vector size
+
+// Choose a vector size
 #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-  typedef realvec<double,4> realvec_t;
+  typedef realvec<double, 4> realvec_t;
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-  typedef realvec<double,2> realvec_t;
+  typedef realvec<double, 2> realvec_t;
 #else
-  typedef realpseudovec<double,1> realvec_t;
+  typedef realpseudovec<double, 1> realvec_t;
 #endif
-  
+
   // Ensure the grid size is aligned
   const ptrdiff_t ldm = align_up(m, realvec_t::size);
   typedef realvec_t::real_t real_t;
-  vector<real_t> x0(ldm*n + realvec_t::size-1), y0(ldm*n + realvec_t::size-1);
-  real_t* restrict const x =
-    (real_t*)align_up(intptr_t(&x0[0]), sizeof(realvec_t));
-  real_t* restrict const y =
-    (real_t*)align_up(intptr_t(&y0[0]), sizeof(realvec_t));
-  for (ptrdiff_t i=0; i<ldm*n; ++i) y[i] = 0.0;
-  
+  vector<real_t> x0(ldm * n + realvec_t::size - 1),
+      y0(ldm * n + realvec_t::size - 1);
+  real_t *restrict const x =
+      (real_t *)align_up(intptr_t(&x0[0]), sizeof(realvec_t));
+  real_t *restrict const y =
+      (real_t *)align_up(intptr_t(&y0[0]), sizeof(realvec_t));
+  for (ptrdiff_t i = 0; i < ldm * n; ++i)
+    y[i] = 0.0;
+
   // Initialize
   init<realvec_t>(&x[0], m, ldm, n);
-  
+
   // Timers
   ticks t0, t1;
   double const cycles_per_tick = 1.0; // measure_tick();
   double cycles;
-  
+
   // Run the different evolution loop versions
   t0 = getticks();
-  for (int iter=0; iter<niters; ++iter) {
+  for (int iter = 0; iter < niters; ++iter) {
     smooth_scalar<realvec_t>(&x[0], &y[0], m, ldm, n);
   }
   t1 = getticks();
-  cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
   cout << "smooth_scalar:    " << cycles << " cycles/point\n";
-  
+
   t0 = getticks();
-  for (int iter=0; iter<niters; ++iter) {
+  for (int iter = 0; iter < niters; ++iter) {
     smooth_unaligned<realvec_t>(&x[0], &y[0], m, ldm, n);
   }
   t1 = getticks();
-  cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
   cout << "smooth_unaligned: " << cycles << " cycles/point\n";
-  
+
   t0 = getticks();
-  for (int iter=0; iter<niters; ++iter) {
+  for (int iter = 0; iter < niters; ++iter) {
     smooth_aligned<realvec_t>(&x[0], &y[0], m, ldm, n);
   }
   t1 = getticks();
-  cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
   cout << "smooth_aligned:   " << cycles << " cycles/point\n";
-  
+
   t0 = getticks();
-  for (int iter=0; iter<niters; ++iter) {
+  for (int iter = 0; iter < niters; ++iter) {
     smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n);
   }
   t1 = getticks();
-  cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+  cycles =
+      cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
   cout << "smooth_padded:    " << cycles << " cycles/point\n";
-  
+
   return 0;
 }
diff --git a/mathfuncs.h b/mathfuncs.h
index 8d90f9a..9f042d1 100644
--- a/mathfuncs.h
+++ b/mathfuncs.h
@@ -19,4 +19,4 @@
 #include "mathfuncs_sinh.h"
 #include "mathfuncs_sqrt.h"
 
-#endif  // #ifndef MATHFUNCS_H
+#endif // #ifndef MATHFUNCS_H
diff --git a/mathfuncs_asin.h b/mathfuncs_asin.h
index 3dd9c75..cd174a2 100644
--- a/mathfuncs_asin.h
+++ b/mathfuncs_asin.h
@@ -7,206 +7,181 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+namespace {
 
-namespace vecmathlib {
-  
-  
-  
-  namespace {
-    
-    template<typename realvec_t>
-    realvec_t mulsign(realvec_t x, realvec_t y)
-    {
-      typedef typename realvec_t::real_t real_t;
-      typedef typename realvec_t::intvec_t intvec_t;
-      typedef intvec_t IV;
-      typedef floatprops<real_t> FP;
-      
-      intvec_t value = as_int(x);
-      intvec_t sign = as_int(y) & IV(FP::signbit_mask);
-      return as_float(value ^ sign);
-    }
-    
-    // Note: the order of arguments is y, x, as is convention for atan2
-    template<typename realvec_t>
-    realvec_t atan2k(realvec_t y, realvec_t x)
-    {
-      // Algorithm taken from SLEEF 2.80
-      
-      typedef typename realvec_t::real_t real_t;
-      typedef typename realvec_t::boolvec_t boolvec_t;
-      typedef realvec_t RV;
-      
-      realvec_t q = RV(0.0);
-      
-      q = ifthen(signbit(x), RV(-2.0), q);
-      x = fabs(x);
-      
-      boolvec_t cond = y > x;
-      realvec_t x0 = x;
-      realvec_t y0 = y;
-      x = ifthen(cond,  y0, x0);
-      y = ifthen(cond, -x0, y0);
-      q += ifthen(cond, RV(1.0), RV(0.0));
-      
-      realvec_t s = y / x;
-      realvec_t t = s * s;
-      
-      realvec_t u;
-      switch (sizeof(real_t)) {
-      default: __builtin_unreachable();
-      case sizeof(float):
-        u = RV(0.00282363896258175373077393f);
-        u = mad(u, t, RV(-0.0159569028764963150024414f));
-        u = mad(u, t, RV(0.0425049886107444763183594f));
-        u = mad(u, t, RV(-0.0748900920152664184570312f));
-        u = mad(u, t, RV(0.106347933411598205566406f));
-        u = mad(u, t, RV(-0.142027363181114196777344f));
-        u = mad(u, t, RV(0.199926957488059997558594f));
-        u = mad(u, t, RV(-0.333331018686294555664062f));
-        break;
-      case sizeof(double):
-        u = RV(-1.88796008463073496563746e-05);
-        u = mad(u, t, RV(0.000209850076645816976906797));
-        u = mad(u, t, RV(-0.00110611831486672482563471));
-        u = mad(u, t, RV(0.00370026744188713119232403));
-        u = mad(u, t, RV(-0.00889896195887655491740809));
-        u = mad(u, t, RV(0.016599329773529201970117));
-        u = mad(u, t, RV(-0.0254517624932312641616861));
-        u = mad(u, t, RV(0.0337852580001353069993897));
-        u = mad(u, t, RV(-0.0407629191276836500001934));
-        u = mad(u, t, RV(0.0466667150077840625632675));
-        u = mad(u, t, RV(-0.0523674852303482457616113));
-        u = mad(u, t, RV(0.0587666392926673580854313));
-        u = mad(u, t, RV(-0.0666573579361080525984562));
-        u = mad(u, t, RV(0.0769219538311769618355029));
-        u = mad(u, t, RV(-0.090908995008245008229153));
-        u = mad(u, t, RV(0.111111105648261418443745));
-        u = mad(u, t, RV(-0.14285714266771329383765));
-        u = mad(u, t, RV(0.199999999996591265594148));
-        u = mad(u, t, RV(-0.333333333333311110369124));
-        break;
-      }
-      
-      t = mad(u, t * s, s);
-      t = mad(q, RV(M_PI_2), t);
-      
-      return t;
-    }
-    
-  }
-  
+template <typename realvec_t> realvec_t mulsign(realvec_t x, realvec_t y) {
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  typedef intvec_t IV;
+  typedef floatprops<real_t> FP;
 
+  intvec_t value = as_int(x);
+  intvec_t sign = as_int(y) & IV(FP::signbit_mask);
+  return as_float(value ^ sign);
+}
 
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    return mulsign(atan2k(fabs(d), sqrt((RV(1.0)+d)*(RV(1.0)-d))), d);
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    return (mulsign(atan2k(sqrt((RV(1.0)+d)*(RV(1.0)-d)), fabs(d)), d) +
-            ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    realvec_t q1 = s;
-    s = fabs(s);
-    
-    boolvec_t q0 = s > RV(1.0);
-    s = ifthen(q0, rcp(s), s);
-    
-    realvec_t t = s * s;
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(0.00282363896258175373077393f);
-      u = mad(u, t, RV(-0.0159569028764963150024414f));
-      u = mad(u, t, RV(0.0425049886107444763183594f));
-      u = mad(u, t, RV(-0.0748900920152664184570312f));
-      u = mad(u, t, RV(0.106347933411598205566406f));
-      u = mad(u, t, RV(-0.142027363181114196777344f));
-      u = mad(u, t, RV(0.199926957488059997558594f));
-      u = mad(u, t, RV(-0.333331018686294555664062f));
-      break;
-    case sizeof(double):
-      u = RV(-1.88796008463073496563746e-05);
-      u = mad(u, t, RV(0.000209850076645816976906797));
-      u = mad(u, t, RV(-0.00110611831486672482563471));
-      u = mad(u, t, RV(0.00370026744188713119232403));
-      u = mad(u, t, RV(-0.00889896195887655491740809));
-      u = mad(u, t, RV(0.016599329773529201970117));
-      u = mad(u, t, RV(-0.0254517624932312641616861));
-      u = mad(u, t, RV(0.0337852580001353069993897));
-      u = mad(u, t, RV(-0.0407629191276836500001934));
-      u = mad(u, t, RV(0.0466667150077840625632675));
-      u = mad(u, t, RV(-0.0523674852303482457616113));
-      u = mad(u, t, RV(0.0587666392926673580854313));
-      u = mad(u, t, RV(-0.0666573579361080525984562));
-      u = mad(u, t, RV(0.0769219538311769618355029));
-      u = mad(u, t, RV(-0.090908995008245008229153));
-      u = mad(u, t, RV(0.111111105648261418443745));
-      u = mad(u, t, RV(-0.14285714266771329383765));
-      u = mad(u, t, RV(0.199999999996591265594148));
-      u = mad(u, t, RV(-0.333333333333311110369124));
-      break;
-    }
-    
-    t = s + s * (t * u);
-    
-    t = ifthen(q0, RV(M_PI_2) - t, t);
-    t = copysign(t, q1);
-    
-    return t;
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t> realvec_t atan2k(realvec_t y, realvec_t x) {
+  // Algorithm taken from SLEEF 2.80
+
+  typedef typename realvec_t::real_t real_t;
+  typedef typename realvec_t::boolvec_t boolvec_t;
+  typedef realvec_t RV;
+
+  realvec_t q = RV(0.0);
+
+  q = ifthen(signbit(x), RV(-2.0), q);
+  x = fabs(x);
+
+  boolvec_t cond = y > x;
+  realvec_t x0 = x;
+  realvec_t y0 = y;
+  x = ifthen(cond, y0, x0);
+  y = ifthen(cond, -x0, y0);
+  q += ifthen(cond, RV(1.0), RV(0.0));
+
+  realvec_t s = y / x;
+  realvec_t t = s * s;
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00282363896258175373077393f);
+    u = mad(u, t, RV(-0.0159569028764963150024414f));
+    u = mad(u, t, RV(0.0425049886107444763183594f));
+    u = mad(u, t, RV(-0.0748900920152664184570312f));
+    u = mad(u, t, RV(0.106347933411598205566406f));
+    u = mad(u, t, RV(-0.142027363181114196777344f));
+    u = mad(u, t, RV(0.199926957488059997558594f));
+    u = mad(u, t, RV(-0.333331018686294555664062f));
+    break;
+  case sizeof(double):
+    u = RV(-1.88796008463073496563746e-05);
+    u = mad(u, t, RV(0.000209850076645816976906797));
+    u = mad(u, t, RV(-0.00110611831486672482563471));
+    u = mad(u, t, RV(0.00370026744188713119232403));
+    u = mad(u, t, RV(-0.00889896195887655491740809));
+    u = mad(u, t, RV(0.016599329773529201970117));
+    u = mad(u, t, RV(-0.0254517624932312641616861));
+    u = mad(u, t, RV(0.0337852580001353069993897));
+    u = mad(u, t, RV(-0.0407629191276836500001934));
+    u = mad(u, t, RV(0.0466667150077840625632675));
+    u = mad(u, t, RV(-0.0523674852303482457616113));
+    u = mad(u, t, RV(0.0587666392926673580854313));
+    u = mad(u, t, RV(-0.0666573579361080525984562));
+    u = mad(u, t, RV(0.0769219538311769618355029));
+    u = mad(u, t, RV(-0.090908995008245008229153));
+    u = mad(u, t, RV(0.111111105648261418443745));
+    u = mad(u, t, RV(-0.14285714266771329383765));
+    u = mad(u, t, RV(0.199999999996591265594148));
+    u = mad(u, t, RV(-0.333333333333311110369124));
+    break;
   }
 
-  
-  
-  // Note: the order of arguments is y, x, as is convention for atan2
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    realvec_t r = atan2k(fabs(y), x);
-    
-    r = mulsign(r, x);
-    
-    r = ifthen(isinf(x) || x == RV(0.0),
-               ifthen(isinf(x),
-                      RV(M_PI_2) - copysign(RV(M_PI_2), x),
-                      RV(M_PI_2)),
-               r);
-    
-    r = ifthen(isinf(y),
-               ifthen(isinf(x),
-                      RV(M_PI_2) - copysign(RV(M_PI_4), x),
-                      RV(M_PI_2)),
-               r);
-    
-    r = ifthen(y == RV(0.0),
-               ifthen(signbit(x), RV(M_PI), RV(0.0)),
-               r);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+  t = mad(u, t * s, s);
+  t = mad(q, RV(M_PI_2), t);
+
+  return t;
+}
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+  return mulsign(atan2k(fabs(d), sqrt((RV(1.0) + d) * (RV(1.0) - d))), d);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+  return (mulsign(atan2k(sqrt((RV(1.0) + d) * (RV(1.0) - d)), fabs(d)), d) +
+          ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s) {
+  // Algorithm taken from SLEEF 2.80
+
+  realvec_t q1 = s;
+  s = fabs(s);
+
+  boolvec_t q0 = s > RV(1.0);
+  s = ifthen(q0, rcp(s), s);
+
+  realvec_t t = s * s;
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00282363896258175373077393f);
+    u = mad(u, t, RV(-0.0159569028764963150024414f));
+    u = mad(u, t, RV(0.0425049886107444763183594f));
+    u = mad(u, t, RV(-0.0748900920152664184570312f));
+    u = mad(u, t, RV(0.106347933411598205566406f));
+    u = mad(u, t, RV(-0.142027363181114196777344f));
+    u = mad(u, t, RV(0.199926957488059997558594f));
+    u = mad(u, t, RV(-0.333331018686294555664062f));
+    break;
+  case sizeof(double):
+    u = RV(-1.88796008463073496563746e-05);
+    u = mad(u, t, RV(0.000209850076645816976906797));
+    u = mad(u, t, RV(-0.00110611831486672482563471));
+    u = mad(u, t, RV(0.00370026744188713119232403));
+    u = mad(u, t, RV(-0.00889896195887655491740809));
+    u = mad(u, t, RV(0.016599329773529201970117));
+    u = mad(u, t, RV(-0.0254517624932312641616861));
+    u = mad(u, t, RV(0.0337852580001353069993897));
+    u = mad(u, t, RV(-0.0407629191276836500001934));
+    u = mad(u, t, RV(0.0466667150077840625632675));
+    u = mad(u, t, RV(-0.0523674852303482457616113));
+    u = mad(u, t, RV(0.0587666392926673580854313));
+    u = mad(u, t, RV(-0.0666573579361080525984562));
+    u = mad(u, t, RV(0.0769219538311769618355029));
+    u = mad(u, t, RV(-0.090908995008245008229153));
+    u = mad(u, t, RV(0.111111105648261418443745));
+    u = mad(u, t, RV(-0.14285714266771329383765));
+    u = mad(u, t, RV(0.199999999996591265594148));
+    u = mad(u, t, RV(-0.333333333333311110369124));
+    break;
   }
-  
+
+  t = s + s * (t * u);
+
+  t = ifthen(q0, RV(M_PI_2) - t, t);
+  t = copysign(t, q1);
+
+  return t;
+}
+
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x) {
+  // Algorithm taken from SLEEF 2.80
+
+  realvec_t r = atan2k(fabs(y), x);
+
+  r = mulsign(r, x);
+
+  r = ifthen(isinf(x) || x == RV(0.0),
+             ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_2), x), RV(M_PI_2)),
+             r);
+
+  r = ifthen(isinf(y),
+             ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_4), x), RV(M_PI_2)),
+             r);
+
+  r = ifthen(y == RV(0.0), ifthen(signbit(x), RV(M_PI), RV(0.0)), r);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/mathfuncs_asinh.h b/mathfuncs_asinh.h
index c7be8eb..1197261 100644
--- a/mathfuncs_asinh.h
+++ b/mathfuncs_asinh.h
@@ -7,36 +7,31 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x) {
+  // Reduce range
+  realvec_t r = fabs(x);
+  r = log(r + sqrt(r * r + RV(1.0)));
+  r = copysign(r, x);
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x) {
+  return log(x + sqrt(x * x - RV(1.0)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x) {
+  // Reduce range
+  realvec_t r = fabs(x);
+  r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r));
+  r = copysign(r, x);
+  return r;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x)
-  {
-    // Reduce range
-    realvec_t r = fabs(x);
-    r = log(r + sqrt(r*r + RV(1.0)));
-    r = copysign(r, x);
-    return r;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x)
-  {
-    return log(x + sqrt(x*x - RV(1.0)));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x)
-  {
-    // Reduce range
-    realvec_t r = fabs(x);
-    r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r));
-    r = copysign(r, x);
-    return r;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASINH_H
+#endif // #ifndef MATHFUNCS_ASINH_H
diff --git a/mathfuncs_base.h b/mathfuncs_base.h
index c685542..8545003 100644
--- a/mathfuncs_base.h
+++ b/mathfuncs_base.h
@@ -5,130 +5,127 @@
 
 #include "floatprops.h"
 
+namespace vecmathlib {
 
+template <typename realvec_t> struct mathfuncs {
+  typedef floatprops<typename realvec_t::real_t> FP;
+
+  typedef typename FP::real_t real_t;
+  typedef typename FP::int_t int_t;
+  typedef typename FP::uint_t uint_t;
+
+  static int const size = realvec_t::size;
+
+  // typedef realvec<real_t, size> realvec_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  typedef typename realvec_t::boolvec_t boolvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  // static real_t R(double a) { return real_t(a); }
+  // static int_t I(int a) { return int_t(a); }
+  // static uint_t U(int a) { return uint_t(a); }
+  // static realvec_t RV(real_t a) { return realvec_t(a); }
+  // static intvec_t IV(int_t a) { return intvec_t(a); }
+  // static boolvec_t BV(bool a) { return boolvec_t(a); }
+
+  // int
+  static intvec_t vml_abs(intvec_t x);
+  static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
+  static intvec_t vml_clz(intvec_t x);
+  static boolvec_t vml_isignbit(intvec_t x);
+  static intvec_t vml_max(intvec_t x, intvec_t y);
+  static intvec_t vml_min(intvec_t x, intvec_t y);
+  static intvec_t vml_popcount(intvec_t x);
+  static intvec_t vml_rotate(intvec_t x, int_t n);
+  static intvec_t vml_rotate(intvec_t x, intvec_t n);
+
+  // asin
+  static realvec_t vml_acos(realvec_t x);
+  static realvec_t vml_asin(realvec_t x);
+  static realvec_t vml_atan(realvec_t x);
+  static realvec_t vml_atan2(realvec_t y, realvec_t x);
+
+  // asinh
+  static realvec_t vml_acosh(realvec_t x);
+  static realvec_t vml_asinh(realvec_t x);
+  static realvec_t vml_atanh(realvec_t x);
+
+  // convert
+  static realvec_t vml_antitrunc(realvec_t x);
+  static realvec_t vml_ceil(realvec_t x);
+  static realvec_t vml_convert_float(intvec_t x);
+  static intvec_t vml_convert_int(realvec_t x);
+  static realvec_t vml_floor(realvec_t x);
+  static intvec_t vml_lrint(realvec_t x);
+  static realvec_t vml_rint(realvec_t x);
+  static realvec_t vml_round(realvec_t x);
+  static realvec_t vml_nextafter(realvec_t x, realvec_t y);
+  static realvec_t vml_trunc(realvec_t x);
+
+  // fabs
+  static realvec_t vml_copysign(realvec_t x, realvec_t y);
+  static realvec_t vml_fabs(realvec_t x);
+  static realvec_t vml_fdim(realvec_t x, realvec_t y);
+  static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
+  static realvec_t vml_fmax(realvec_t x, realvec_t y);
+  static realvec_t vml_fmin(realvec_t x, realvec_t y);
+  static realvec_t vml_frexp(realvec_t x, intvec_t *r);
+  static intvec_t vml_ilogb(realvec_t x);
+  static boolvec_t vml_ieee_isfinite(realvec_t x);
+  static boolvec_t vml_ieee_isinf(realvec_t x);
+  static boolvec_t vml_ieee_isnan(realvec_t x);
+  static boolvec_t vml_ieee_isnormal(realvec_t x);
+  static boolvec_t vml_isfinite(realvec_t x);
+  static boolvec_t vml_isinf(realvec_t x);
+  static boolvec_t vml_isnan(realvec_t x);
+  static boolvec_t vml_isnormal(realvec_t x);
+  static realvec_t vml_ldexp(realvec_t x, intvec_t n);
+  static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
+  static boolvec_t vml_signbit(realvec_t x);
+
+  // exp
+  static realvec_t vml_exp(realvec_t x);
+  static realvec_t vml_exp10(realvec_t x);
+  static realvec_t vml_exp2(realvec_t x);
+  static realvec_t vml_expm1(realvec_t x);
+
+  // log
+  static realvec_t vml_log(realvec_t x);
+  static realvec_t vml_log10(realvec_t x);
+  static realvec_t vml_log1p(realvec_t x);
+  static realvec_t vml_log2(realvec_t x);
+
+  // pow
+  static realvec_t vml_pow(realvec_t x, realvec_t y);
+
+  // rcp
+  static realvec_t vml_fmod(realvec_t x, realvec_t y);
+  static realvec_t vml_rcp(realvec_t x);
+  static realvec_t vml_remainder(realvec_t x, realvec_t y);
+
+  // sin
+  static realvec_t vml_cos(realvec_t x);
+  static realvec_t vml_sin(realvec_t x);
+  static realvec_t vml_tan(realvec_t x);
+
+  // sinh
+  static realvec_t vml_cosh(realvec_t x);
+  static realvec_t vml_sinh(realvec_t x);
+  static realvec_t vml_tanh(realvec_t x);
+
+  // sqrt
+  static realvec_t vml_cbrt(realvec_t x);
+  static realvec_t vml_hypot(realvec_t x, realvec_t y);
+  static realvec_t vml_rsqrt(realvec_t x);
+  static realvec_t vml_sqrt(realvec_t x);
+};
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  struct mathfuncs {
-    typedef floatprops<typename realvec_t::real_t> FP;
-    
-    typedef typename FP::real_t real_t;
-    typedef typename FP::int_t int_t;
-    typedef typename FP::uint_t uint_t;
-    
-    static int const size = realvec_t::size;
-    
-    // typedef realvec<real_t, size> realvec_t;
-    typedef typename realvec_t::intvec_t intvec_t;
-    typedef typename realvec_t::boolvec_t boolvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    // static real_t R(double a) { return real_t(a); }
-    // static int_t I(int a) { return int_t(a); }
-    // static uint_t U(int a) { return uint_t(a); }
-    // static realvec_t RV(real_t a) { return realvec_t(a); }
-    // static intvec_t IV(int_t a) { return intvec_t(a); }
-    // static boolvec_t BV(bool a) { return boolvec_t(a); }
-    
-    // int
-    static intvec_t vml_abs(intvec_t x);
-    static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
-    static intvec_t vml_clz(intvec_t x);
-    static boolvec_t vml_isignbit(intvec_t x);
-    static intvec_t vml_max(intvec_t x, intvec_t y);
-    static intvec_t vml_min(intvec_t x, intvec_t y);
-    static intvec_t vml_popcount(intvec_t x);
-    static intvec_t vml_rotate(intvec_t x, int_t n);
-    static intvec_t vml_rotate(intvec_t x, intvec_t n);
-    
-    // asin
-    static realvec_t vml_acos(realvec_t x);
-    static realvec_t vml_asin(realvec_t x);
-    static realvec_t vml_atan(realvec_t x);
-    static realvec_t vml_atan2(realvec_t y, realvec_t x);
-    
-    // asinh
-    static realvec_t vml_acosh(realvec_t x);
-    static realvec_t vml_asinh(realvec_t x);
-    static realvec_t vml_atanh(realvec_t x);
-    
-    // convert
-    static realvec_t vml_antitrunc(realvec_t x);
-    static realvec_t vml_ceil(realvec_t x);
-    static realvec_t vml_convert_float(intvec_t x);
-    static intvec_t vml_convert_int(realvec_t x);
-    static realvec_t vml_floor(realvec_t x);
-    static intvec_t vml_lrint(realvec_t x);
-    static realvec_t vml_rint(realvec_t x);
-    static realvec_t vml_round(realvec_t x);
-    static realvec_t vml_nextafter(realvec_t x, realvec_t y);
-    static realvec_t vml_trunc(realvec_t x);
-    
-    // fabs
-    static realvec_t vml_copysign(realvec_t x, realvec_t y);
-    static realvec_t vml_fabs(realvec_t x);
-    static realvec_t vml_fdim(realvec_t x, realvec_t y);
-    static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
-    static realvec_t vml_fmax(realvec_t x, realvec_t y);
-    static realvec_t vml_fmin(realvec_t x, realvec_t y);
-    static realvec_t vml_frexp(realvec_t x, intvec_t* r);
-    static intvec_t vml_ilogb(realvec_t x);
-    static boolvec_t vml_ieee_isfinite(realvec_t x);
-    static boolvec_t vml_ieee_isinf(realvec_t x);
-    static boolvec_t vml_ieee_isnan(realvec_t x);
-    static boolvec_t vml_ieee_isnormal(realvec_t x);
-    static boolvec_t vml_isfinite(realvec_t x);
-    static boolvec_t vml_isinf(realvec_t x);
-    static boolvec_t vml_isnan(realvec_t x);
-    static boolvec_t vml_isnormal(realvec_t x);
-    static realvec_t vml_ldexp(realvec_t x, intvec_t n);
-    static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
-    static boolvec_t vml_signbit(realvec_t x);
-    
-    // exp
-    static realvec_t vml_exp(realvec_t x);
-    static realvec_t vml_exp10(realvec_t x);
-    static realvec_t vml_exp2(realvec_t x);
-    static realvec_t vml_expm1(realvec_t x);
-    
-    // log
-    static realvec_t vml_log(realvec_t x);
-    static realvec_t vml_log10(realvec_t x);
-    static realvec_t vml_log1p(realvec_t x);
-    static realvec_t vml_log2(realvec_t x);
-    
-    // pow
-    static realvec_t vml_pow(realvec_t x, realvec_t y);
-    
-    // rcp
-    static realvec_t vml_fmod(realvec_t x, realvec_t y);
-    static realvec_t vml_rcp(realvec_t x);
-    static realvec_t vml_remainder(realvec_t x, realvec_t y);
-    
-    // sin
-    static realvec_t vml_cos(realvec_t x);
-    static realvec_t vml_sin(realvec_t x);
-    static realvec_t vml_tan(realvec_t x);
-    
-    // sinh
-    static realvec_t vml_cosh(realvec_t x);
-    static realvec_t vml_sinh(realvec_t x);
-    static realvec_t vml_tanh(realvec_t x);
-    
-    // sqrt
-    static realvec_t vml_cbrt(realvec_t x);
-    static realvec_t vml_hypot(realvec_t x, realvec_t y);
-    static realvec_t vml_rsqrt(realvec_t x);
-    static realvec_t vml_sqrt(realvec_t x);
-  };
-  
 } // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_BASE_H
+#endif // #ifndef MATHFUNCS_BASE_H
diff --git a/mathfuncs_convert.h b/mathfuncs_convert.h
index 79befbc..9cb1add 100644
--- a/mathfuncs_convert.h
+++ b/mathfuncs_convert.h
@@ -7,197 +7,179 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x) {
+  // Convert in two passes. Convert as much as possible during the
+  // first pass (lobits), so that the second pass (hibits) may be
+  // omitted if the high bits are known to be zero.
+  int_t lobits = FP::mantissa_bits;
+  // int_t hibits = FP::bits - lobits;
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x)
-  {
-    // Convert in two passes. Convert as much as possible during the
-    // first pass (lobits), so that the second pass (hibits) may be
-    // omitted if the high bits are known to be zero.
-    int_t lobits = FP::mantissa_bits;
-    // int_t hibits = FP::bits - lobits;
-    
-    // Convert lower bits
-    intvec_t xlo = x & IV((U(1) << lobits) - 1);
-    // exponent for the equivalent floating point number
-    int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
-    xlo |= exponent_lo;
-    // subtract hidden mantissa bit
-    realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
-    
-    // Convert upper bits
-    // make unsigned by subtracting largest negative number
-    // (only do this for the high bits, since they have sufficient
-    // precision to handle the overflow)
-    x ^= FP::signbit_mask;
-    intvec_t xhi = lsr(x, lobits);
-    // exponent for the equivalent floating point number
-    int_t exponent_hi = (FP::exponent_offset + 2*lobits) << FP::mantissa_bits;
-    xhi |= exponent_hi;
-    // subtract hidden mantissa bit
-    realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
-    // add largest negative number again
-    fhi -= RV(R(FP::signbit_mask));
-    // Ensure that the converted low and high bits are calculated
-    // separately, since a real_t doesn't have enough precision to
-    // hold all the bits of an int_t
-    fhi.barrier();
-    
-    // Combine results
-    return flo + fhi;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t
-  mathfuncs<realvec_t>::vml_convert_int(realvec_t x)
-  {
-    // Handle overflow
-    // int_t min_int = FP::signbit_mask;
-    // int_t max_int = ~FP::signbit_mask;
-    // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
-    // Handle negative numbers
-    boolvec_t is_negative = signbit(x);
-    x = fabs(x);
-    // Handle small numbers
-    boolvec_t issmall = x < RV(1.0);
-    
-    intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
-    boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits)); 
-    intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
-    // add hidden mantissa bit
-    ix |= U(1) << FP::mantissa_bits;
-    // shift according to exponent (which may truncate)
-    ix = ifthen(shift_left, ix << shift, ix >> -shift);
-    
-    // Handle small numbers
-    ix = ifthen(issmall, IV(I(0)), ix);
-    // Handle negative numbers
-    ix = ifthen(is_negative, -ix, ix);
-    // Handle overflow
-    // ix = ifthen(is_overflow, IV(min_int), ix);
-    
-    return ix;
-  }
-  
-  
-  
-  // Round to nearest integer, breaking ties using prevailing rounding
-  // mode (default: round to even)
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x)
-  {
-    realvec_t r = x;
-    // Round by adding a large number, destroying all excess precision
-    realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
-    r += offset;
-    // Ensure the rounding is not optimised away
-    r.barrier();
-    r -= offset;
-    return r;
-  }
-  
-  // Round to next integer above
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x)
-  {
-    // boolvec_t iszero = x == RV(0.0);
-    // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
-    // return ifthen(iszero, x, rint(x + offset));
-    return ifthen(x<RV(0.0), trunc(x), vml_antitrunc(x));
-  }
-  
-  // Round to next integer below
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x)
-  {
-    // boolvec_t iszero = x == RV(0.0);
-    // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
-    // return ifthen(iszero, x, rint(x - offset));
-    return ifthen(x<RV(0.0), vml_antitrunc(x), trunc(x));
-  }
-  
-  // Round to nearest integer, breaking ties using prevailing rounding
-  // mode (default: round to even), returning an integer
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x)
-  {
-    return convert_int(rint(x));
-  }
-  
-  // Round to nearest integer, breaking ties away from zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x)
-  {
-    // return copysign(floor(fabs(x)+RV(0.5)), x);
-    return trunc(x + copysign(RV(0.5), x));
-  }
-  
-  // Round to next integer towards zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x)
-  {
-    realvec_t x0 = x;
-    x = fabs(x);
-    boolvec_t istoosmall = x < RV(1.0);
-    boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
-    // Number of mantissa bits to keep
-    intvec_t nbits = ilogb(x);
-    // This is probably faster than a shift operation
-    realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
-    intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
-    realvec_t y = as_float(as_int(x) & imask);
-    realvec_t r =
+  // Convert lower bits
+  intvec_t xlo = x & IV((U(1) << lobits) - 1);
+  // exponent for the equivalent floating point number
+  int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
+  xlo |= exponent_lo;
+  // subtract hidden mantissa bit
+  realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
+
+  // Convert upper bits
+  // make unsigned by subtracting largest negative number
+  // (only do this for the high bits, since they have sufficient
+  // precision to handle the overflow)
+  x ^= FP::signbit_mask;
+  intvec_t xhi = lsr(x, lobits);
+  // exponent for the equivalent floating point number
+  int_t exponent_hi = (FP::exponent_offset + 2 * lobits) << FP::mantissa_bits;
+  xhi |= exponent_hi;
+  // subtract hidden mantissa bit
+  realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
+  // add largest negative number again
+  fhi -= RV(R(FP::signbit_mask));
+  // Ensure that the converted low and high bits are calculated
+  // separately, since a real_t doesn't have enough precision to
+  // hold all the bits of an int_t
+  fhi.barrier();
+
+  // Combine results
+  return flo + fhi;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_convert_int(realvec_t x) {
+  // Handle overflow
+  // int_t min_int = FP::signbit_mask;
+  // int_t max_int = ~FP::signbit_mask;
+  // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
+  // Handle negative numbers
+  boolvec_t is_negative = signbit(x);
+  x = fabs(x);
+  // Handle small numbers
+  boolvec_t issmall = x < RV(1.0);
+
+  intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
+  boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits));
+  intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
+  // add hidden mantissa bit
+  ix |= U(1) << FP::mantissa_bits;
+  // shift according to exponent (which may truncate)
+  ix = ifthen(shift_left, ix << shift, ix >> -shift);
+
+  // Handle small numbers
+  ix = ifthen(issmall, IV(I(0)), ix);
+  // Handle negative numbers
+  ix = ifthen(is_negative, -ix, ix);
+  // Handle overflow
+  // ix = ifthen(is_overflow, IV(min_int), ix);
+
+  return ix;
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even)
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x) {
+  realvec_t r = x;
+  // Round by adding a large number, destroying all excess precision
+  realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
+  r += offset;
+  // Ensure the rounding is not optimised away
+  r.barrier();
+  r -= offset;
+  return r;
+}
+
+// Round to next integer above
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x) {
+  // boolvec_t iszero = x == RV(0.0);
+  // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+  // return ifthen(iszero, x, rint(x + offset));
+  return ifthen(x < RV(0.0), trunc(x), vml_antitrunc(x));
+}
+
+// Round to next integer below
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x) {
+  // boolvec_t iszero = x == RV(0.0);
+  // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+  // return ifthen(iszero, x, rint(x - offset));
+  return ifthen(x < RV(0.0), vml_antitrunc(x), trunc(x));
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even), returning an integer
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x) {
+  return convert_int(rint(x));
+}
+
+// Round to nearest integer, breaking ties away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x) {
+  // return copysign(floor(fabs(x)+RV(0.5)), x);
+  return trunc(x + copysign(RV(0.5), x));
+}
+
+// Round to next integer towards zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x) {
+  realvec_t x0 = x;
+  x = fabs(x);
+  boolvec_t istoosmall = x < RV(1.0);
+  boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
+  // Number of mantissa bits to keep
+  intvec_t nbits = ilogb(x);
+  // This is probably faster than a shift operation
+  realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+  intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+  realvec_t y = as_float(as_int(x) & imask);
+  realvec_t r =
       copysign(ifthen(istoosmall, RV(0.0), ifthen(istoolarge, x, y)), x0);
-    return r;
-  }
-  
-  // Round to next integer away from zero
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x)
-  {
-    realvec_t x0 = x;
-    x = fabs(x);
-    boolvec_t iszero = x == RV(0.0);
-    boolvec_t issmall = x <= RV(1.0);
-    boolvec_t istoolarge =
-      x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
-    // Number of mantissa bits to keep
-    intvec_t nbits = ilogb(x);
-    // This is probably faster than a shift operation
-    realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
-    intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
-    realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
-    offset.barrier();
-    realvec_t y = as_float(as_int(x + offset) & imask);
-    realvec_t r =
+  return r;
+}
+
+// Round to next integer away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x) {
+  realvec_t x0 = x;
+  x = fabs(x);
+  boolvec_t iszero = x == RV(0.0);
+  boolvec_t issmall = x <= RV(1.0);
+  boolvec_t istoolarge = x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
+  // Number of mantissa bits to keep
+  intvec_t nbits = ilogb(x);
+  // This is probably faster than a shift operation
+  realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+  intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+  realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
+  offset.barrier();
+  realvec_t y = as_float(as_int(x + offset) & imask);
+  realvec_t r =
       copysign(ifthen(iszero, RV(0.0),
-                      ifthen(issmall, RV(1.0),
-                             ifthen(istoolarge, x, y))), x0);
-    return r;
-  }
-  
-  // Next machine representable number from x in direction y
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y)
-  {
-    realvec_t dir = y - x;
-    realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
-    offset = copysign(offset, dir);
-    offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
-                    signbit(x) == signbit(offset),
-                    offset,
-                    offset * RV(0.5));
-    realvec_t r = x + offset;
-    real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
-    return ifthen(dir==RV(0.0), y,
-                  ifthen(x==RV(0.0), copysign(RV(smallest_pos), dir), r));
-  }
-  
+                      ifthen(issmall, RV(1.0), ifthen(istoolarge, x, y))),
+               x0);
+  return r;
+}
+
+// Next machine representable number from x in direction y
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y) {
+  realvec_t dir = y - x;
+  realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
+  offset = copysign(offset, dir);
+  offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
+                      signbit(x) == signbit(offset),
+                  offset, offset * RV(0.5));
+  realvec_t r = x + offset;
+  real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
+  return ifthen(dir == RV(0.0), y,
+                ifthen(x == RV(0.0), copysign(RV(smallest_pos), dir), r));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_CONVERT_H
+#endif // #ifndef MATHFUNCS_CONVERT_H
diff --git a/mathfuncs_exp.h b/mathfuncs_exp.h
index d357a21..e35fb1b 100644
--- a/mathfuncs_exp.h
+++ b/mathfuncs_exp.h
@@ -7,156 +7,145 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+  // (in particular the improved-precision truncation)
+
+  // Rescale
+  realvec_t x0 = x;
+
+// realvec_t round_x = rint(x);
+// intvec_t iround_x = convert_int(round_x);
+// r = ldexp(r, iround_x);
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    // (in particular the improved-precision truncation)
-    
-    // Rescale
-    realvec_t x0 = x;
-    
-    // realvec_t round_x = rint(x);
-    // intvec_t iround_x = convert_int(round_x);
-    // r = ldexp(r, iround_x);
-    
 #if 0
     // Straightforward implementation
     realvec_t round_x = rint(x);
     x -= round_x;
 #elif 1
-    // Round by adding, then subtracting again a large number
-    // Add a large number to move the mantissa bits to the right
-    int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
-    realvec_t tmp = x + RV(R(large));
-    tmp.barrier();
-    
-    realvec_t round_x = tmp - RV(R(large));
-    x -= round_x;
+  // Round by adding, then subtracting again a large number
+  // Add a large number to move the mantissa bits to the right
+  int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
+  realvec_t tmp = x + RV(R(large));
+  tmp.barrier();
+
+  realvec_t round_x = tmp - RV(R(large));
+  x -= round_x;
 #else
-    // Straightforward implementation, using round instead of rint,
-    // since round is faster for QPX
-    realvec_t round_x = round(x);
-    x -= round_x;
+  // Straightforward implementation, using round instead of rint,
+  // since round is faster for QPX
+  realvec_t round_x = round(x);
+  x -= round_x;
 #endif
-    VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
-    
-    // Polynomial expansion
-    realvec_t r;
-    switch (sizeof(real_t)) {
-    case 4:
+  VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
+
+  // Polynomial expansion
+  realvec_t r;
+  switch (sizeof(real_t)) {
+  case 4:
 #ifdef VML_HAVE_FP_CONTRACT
-      // float, error=4.55549108005200277750378992345e-9
-      r = RV(0.000154653240842602623787395880898);
-      r = mad(r, x, RV(0.00133952915439234389712105060319));
-      r = mad(r, x, RV(0.0096180399118156827664944870552));
-      r = mad(r, x, RV(0.055503406540531310853149866446));
-      r = mad(r, x, RV(0.240226511015459465468737123346));
-      r = mad(r, x, RV(0.69314720007380208630542805293));
-      r = mad(r, x, RV(0.99999999997182023878745628977));
+    // float, error=4.55549108005200277750378992345e-9
+    r = RV(0.000154653240842602623787395880898);
+    r = mad(r, x, RV(0.00133952915439234389712105060319));
+    r = mad(r, x, RV(0.0096180399118156827664944870552));
+    r = mad(r, x, RV(0.055503406540531310853149866446));
+    r = mad(r, x, RV(0.240226511015459465468737123346));
+    r = mad(r, x, RV(0.69314720007380208630542805293));
+    r = mad(r, x, RV(0.99999999997182023878745628977));
 #else
-      // float, error=1.62772721960621336664735896836e-7
-      r = RV(0.00133952915439234389712105060319);
-      r = mad(r, x, RV(0.009670773148229417605024318985));
-      r = mad(r, x, RV(0.055503406540531310853149866446));
-      r = mad(r, x, RV(0.240222115700585316818177639177));
-      r = mad(r, x, RV(0.69314720007380208630542805293));
-      r = mad(r, x, RV(1.00000005230745711373079206024));
+    // float, error=1.62772721960621336664735896836e-7
+    r = RV(0.00133952915439234389712105060319);
+    r = mad(r, x, RV(0.009670773148229417605024318985));
+    r = mad(r, x, RV(0.055503406540531310853149866446));
+    r = mad(r, x, RV(0.240222115700585316818177639177));
+    r = mad(r, x, RV(0.69314720007380208630542805293));
+    r = mad(r, x, RV(1.00000005230745711373079206024));
 #endif
-      break;
-    case 8:
+    break;
+  case 8:
 #ifdef VML_HAVE_FP_CONTRACT
-      // double, error=9.32016781355638010975628074746e-18
-      r = RV(4.45623165388261696886670014471e-10);
-      r = mad(r, x, RV(7.0733589360775271430968224806e-9));
-      r = mad(r, x, RV(1.01780540270960163558119510246e-7));
-      r = mad(r, x, RV(1.3215437348041505269462510712e-6));
-      r = mad(r, x, RV(0.000015252733849766201174247690629));
-      r = mad(r, x, RV(0.000154035304541242555115696403795));
-      r = mad(r, x, RV(0.00133335581463968601407096905671));
-      r = mad(r, x, RV(0.0096181291075949686712855561931));
-      r = mad(r, x, RV(0.055504108664821672870565883052));
-      r = mad(r, x, RV(0.240226506959101382690753994082));
-      r = mad(r, x, RV(0.69314718055994530864272481773));
-      r = mad(r, x, RV(0.9999999999999999978508676375));
+    // double, error=9.32016781355638010975628074746e-18
+    r = RV(4.45623165388261696886670014471e-10);
+    r = mad(r, x, RV(7.0733589360775271430968224806e-9));
+    r = mad(r, x, RV(1.01780540270960163558119510246e-7));
+    r = mad(r, x, RV(1.3215437348041505269462510712e-6));
+    r = mad(r, x, RV(0.000015252733849766201174247690629));
+    r = mad(r, x, RV(0.000154035304541242555115696403795));
+    r = mad(r, x, RV(0.00133335581463968601407096905671));
+    r = mad(r, x, RV(0.0096181291075949686712855561931));
+    r = mad(r, x, RV(0.055504108664821672870565883052));
+    r = mad(r, x, RV(0.240226506959101382690753994082));
+    r = mad(r, x, RV(0.69314718055994530864272481773));
+    r = mad(r, x, RV(0.9999999999999999978508676375));
 #else
-      // double, error=3.74939899823302048807873981077e-14
-      r = RV(1.02072375599725694063203809188e-7);
-      r = mad(r, x, RV(1.32573274434801314145133004073e-6));
-      r = mad(r, x, RV(0.0000152526647170731944840736190013));
-      r = mad(r, x, RV(0.000154034441925859828261898614555));
-      r = mad(r, x, RV(0.00133335582175770747495287552557));
-      r = mad(r, x, RV(0.0096181291794939392517233403183));
-      r = mad(r, x, RV(0.055504108664525029438908798685));
-      r = mad(r, x, RV(0.240226506957026959772247598695));
-      r = mad(r, x, RV(0.6931471805599487321347668143));
-      r = mad(r, x, RV(1.00000000000000942892870993489));
+    // double, error=3.74939899823302048807873981077e-14
+    r = RV(1.02072375599725694063203809188e-7);
+    r = mad(r, x, RV(1.32573274434801314145133004073e-6));
+    r = mad(r, x, RV(0.0000152526647170731944840736190013));
+    r = mad(r, x, RV(0.000154034441925859828261898614555));
+    r = mad(r, x, RV(0.00133335582175770747495287552557));
+    r = mad(r, x, RV(0.0096181291794939392517233403183));
+    r = mad(r, x, RV(0.055504108664525029438908798685));
+    r = mad(r, x, RV(0.240226506957026959772247598695));
+    r = mad(r, x, RV(0.6931471805599487321347668143));
+    r = mad(r, x, RV(1.00000000000000942892870993489));
 #endif
-      break;
-    default:
-      __builtin_unreachable();
-    }
-    
-    // Undo rescaling
+    break;
+  default:
+    __builtin_unreachable();
+  }
+
+// Undo rescaling
 #if 0
     // Straightforward implementation
     r = ldexp(r, convert_int(round_x));
 #elif 1
-    // Use direct integer manipulation
-    // Extract integer as lowest mantissa bits (highest bits still
-    // contain offset, exponent, and sign)
-    intvec_t itmp = as_int(tmp);
-    // Construct scale factor by setting exponent (this shifts out the
-    // highest bits)
-    realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
-    r *= scale;
+  // Use direct integer manipulation
+  // Extract integer as lowest mantissa bits (highest bits still
+  // contain offset, exponent, and sign)
+  intvec_t itmp = as_int(tmp);
+  // Construct scale factor by setting exponent (this shifts out the
+  // highest bits)
+  realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
+  r *= scale;
 #else
-    // Use floating point operations instead of integer operations,
-    // since these are faster for QPX
-    real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
-    real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
-    realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
-    realvec_t scale = as_float(convert_int(exponent));
-    r *= scale;
+  // Use floating point operations instead of integer operations,
+  // since these are faster for QPX
+  real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
+  real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
+  realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
+  realvec_t scale = as_float(convert_int(exponent));
+  r *= scale;
 #endif
-    
-    r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x)
-  {
-    return exp2(RV(M_LOG2E) * x);
-  }
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x)
-  {
-    return exp2(RV(M_LOG2E * M_LN10) * x);
-  }
+  r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
+
+  return r;
+}
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x)
-  {
-    // TODO: improve this
-    return exp(x) - RV(1.0);
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x) {
+  return exp2(RV(M_LOG2E) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x) {
+  return exp2(RV(M_LOG2E * M_LN10) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x) {
+  // TODO: improve this
+  return exp(x) - RV(1.0);
 #if 0
     r = exp(x) - RV(1.0);
     return ifthen(r == RV(0.0), x, r);
 #endif
-  }
-  
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_EXP_H
+#endif // #ifndef MATHFUNCS_EXP_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
index 4f31dec..c3f7356 100644
--- a/mathfuncs_fabs.h
+++ b/mathfuncs_fabs.h
@@ -7,201 +7,176 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) {
+  intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
+  intvec_t sign = as_int(y) & IV(FP::signbit_mask);
+  return as_float(sign | value);
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
-  {
-    intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
-    intvec_t sign = as_int(y) & IV(FP::signbit_mask);
-    return as_float(sign | value);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
-  {
-    return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y)
-  {
-    // return ifthen(x > y, x - y, RV(0.0));
-    return fmax(x - y, RV(0.0));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z)
-  {
-    return x * y + z;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y)
-  {
-    return ifthen(x < y, y, x);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y)
-  {
-    return ifthen(y < x, y, x);
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
-                                            typename realvec_t::intvec_t* irp)
-  {
-    intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t ir = e - IV(FP::exponent_offset - 1);
-    ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) {
+  return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) {
+  // return ifthen(x > y, x - y, RV(0.0));
+  return fmax(x - y, RV(0.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) {
+  return x * y + z;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) {
+  return ifthen(x < y, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) {
+  return ifthen(y < x, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
+                                          typename realvec_t::intvec_t *irp) {
+  intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t ir = e - IV(FP::exponent_offset - 1);
+  ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
 #if defined VML_HAVE_INF
-    ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
+  ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
 #endif
 #if defined VML_HAVE_NAN
-    ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
+  ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
 #endif
-    realvec_t r =
+  realvec_t r =
       as_float((as_int(x) & IV(FP::signbit_mask | FP::mantissa_mask)) |
                IV(FP::as_int(R(0.5)) & FP::exponent_mask));
-    boolvec_t iszero = x == RV(0.0);
-    ir = ifthen(iszero, IV(I(0)), ir);
-    r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
-    *irp = ir;
-    return r;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t r = e - IV(FP::exponent_offset);
-    r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
+  boolvec_t iszero = x == RV(0.0);
+  ir = ifthen(iszero, IV(I(0)), ir);
+  r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
+  *irp = ir;
+  return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+  intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t r = e - IV(FP::exponent_offset);
+  r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
 #if defined VML_HAVE_INF
-    r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
+  r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
 #endif
 #if defined VML_HAVE_NAN
-    r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
+  r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
 #endif
-    return r;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x)
-  {
-    return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x)
-  {
-    return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x)
-  {
-    return
-      (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
-      (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x)
-  {
-    return
-      (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
-      (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t
-  mathfuncs<realvec_t>::vml_isfinite(realvec_t x)
-  {
+  return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x) {
+  return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
+         (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x) {
+  return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
+         (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isfinite(realvec_t x) {
 #if defined VML_HAVE_INF || defined VML_HAVE_NAN
-    return vml_ieee_isfinite(x);
+  return vml_ieee_isfinite(x);
 #else
-    return BV(true);
+  return BV(true);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x) {
 #if defined VML_HAVE_INF
-    return vml_ieee_isinf(x);
+  return vml_ieee_isinf(x);
 #else
-    return BV(false);
+  return BV(false);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x) {
 #if defined VML_HAVE_NAN
-    return vml_ieee_isnan(x);
+  return vml_ieee_isnan(x);
 #else
-    return BV(false);
+  return BV(false);
 #endif
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x)
-  {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x) {
 #if defined VML_HAVE_DENORMALS || defined VML_HAVE_INF || defined VML_HAVE_NAN
-    return vml_ieee_isnormal(x);
+  return vml_ieee_isnormal(x);
 #else
-    return BV(true);
+  return BV(true);
 #endif
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n) {
+// TODO: Check SLEEF 2.80 algorithm
 #if 0
     realvec_t r = as_float(as_int(x) + (n << I(FP::mantissa_bits)));
     r = ifthen((as_int(x) & IV(FP::exponent_mask)) == IV(I(0)), x, r);
     return r;
 #endif
-    realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
-    int max_n = FP::max_exponent - FP::min_exponent;
-    boolvec_t underflow = n < IV(I(-max_n));
-    boolvec_t overflow = n > IV(I(max_n));
-    intvec_t old_exp =
-      lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
-    intvec_t new_exp = old_exp + n;
-    // TODO: check bit patterns instead
-    underflow =
+  realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
+  int max_n = FP::max_exponent - FP::min_exponent;
+  boolvec_t underflow = n < IV(I(-max_n));
+  boolvec_t overflow = n > IV(I(max_n));
+  intvec_t old_exp = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+  intvec_t new_exp = old_exp + n;
+  // TODO: check bit patterns instead
+  underflow =
       underflow || new_exp < IV(I(FP::min_exponent + FP::exponent_offset));
-    overflow =
+  overflow =
       overflow || new_exp > IV(I(FP::max_exponent + FP::exponent_offset));
-    r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
-    r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
-    boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
-    r = ifthen(dont_change, x, r);
-    return r;
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z)
-  {
-    return x * y + z;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x)
-  {
-    return convert_bool(as_int(x) & IV(FP::signbit_mask));
-  }
-  
+  r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
+  r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
+  boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
+  r = ifthen(dont_change, x, r);
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z) {
+  return x * y + z;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x) {
+  return convert_bool(as_int(x) & IV(FP::signbit_mask));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_FABS_H
+#endif // #ifndef MATHFUNCS_FABS_H
diff --git a/mathfuncs_int.h b/mathfuncs_int.h
index 862189d..fff65ff 100644
--- a/mathfuncs_int.h
+++ b/mathfuncs_int.h
@@ -7,129 +7,128 @@
 
 #include <climits>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x) {
+  return ifthen(isignbit(x), -x, x);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_bitifthen(intvec_t x, intvec_t y, intvec_t z) {
+  return (x & y) | (~x & z);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x) {
+  // These implementations return 8*sizeof(TYPE) when the input is 0
+
+  // These explicit implementations are taken from
+  // <http://aggregate.org/MAGIC/>:
+  //
+  // @techreport{magicalgorithms,
+  //   author={Henry Gordon Dietz},
+  //   title={{The Aggregate Magic Algorithms}},
+  //   institution={University of Kentucky},
+  //   howpublished={Aggregate.Org online technical report},
+  //   date={2013-03-25},
+  //   URL={http://aggregate.org/MAGIC/}
+  // }
+
+  int_t bits = CHAR_BIT * sizeof(int_t);
+  if (bits > 1)
+    x |= lsr(x, 1);
+  if (bits > 2)
+    x |= lsr(x, 2);
+  if (bits > 4)
+    x |= lsr(x, 4);
+  if (bits > 8)
+    x |= lsr(x, 8);
+  if (bits > 16)
+    x |= lsr(x, 16);
+  if (bits > 32)
+    x |= lsr(x, 32);
+  if (bits > 64)
+    x |= lsr(x, 64);
+  assert(bits <= 128);
+  return IV(I(bits)) - popcount(x);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x) {
+  return x < IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
+                                                           intvec_t y) {
+  return ifthen(x >= y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
+                                                           intvec_t y) {
+  return ifthen(x < y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x) {
+  // These explicit implementations are taken from
+  // <http://aggregate.org/MAGIC/>:
+  //
+  // @techreport{magicalgorithms,
+  //   author={Henry Gordon Dietz},
+  //   title={{The Aggregate Magic Algorithms}},
+  //   institution={University of Kentucky},
+  //   howpublished={Aggregate.Org online technical report},
+  //   date={2013-03-25},
+  //   URL={http://aggregate.org/MAGIC/}
+  // }
+
+  int_t bits = CHAR_BIT * sizeof(int_t);
+
+  // intvec_t x55 = IV(FP::replicate_byte(0x55));
+  // intvec_t x33 = IV(FP::replicate_byte(0x33));
+  // intvec_t x0f = IV(FP::replicate_byte(0x0f));
+  intvec_t x55 = I(~U(0) / U(3));  // 0b01010101...
+  intvec_t x33 = I(~U(0) / U(5));  // 0b00110011...
+  intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
+
+  x -= lsr(x, I(1)) & x55;
+  x = (x & x33) + (lsr(x, I(2)) & x33);
+  x += lsr(x, I(4));
+  x &= x0f;
+  if (bits > 8)
+    x += lsr(x, I(8));
+  if (bits > 16)
+    x += lsr(x, I(16));
+  if (bits > 32)
+    x += lsr(x, I(32));
+  if (bits > 64)
+    x += lsr(x, I(64));
+  assert(bits <= 128);
+  return x & IV(I(0xff));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+                                                              int_t n) {
+  int_t mask = CHAR_BIT * sizeof(int_t) - 1;
+  intvec_t left = x << (n & mask);
+  intvec_t right = lsr(x, -n & mask);
+  return left | right;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+                                                              intvec_t n) {
+  intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
+  intvec_t left = x << (n & mask);
+  intvec_t right = lsr(x, -n & mask);
+  return left | right;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x)
-  {
-    return ifthen(isignbit(x), -x, x);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_bitifthen(intvec_t x,
-                                                                   intvec_t y,
-                                                                   intvec_t z)
-  {
-    return (x & y) | (~x & z);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x)
-  {
-    // These implementations return 8*sizeof(TYPE) when the input is 0
-    
-    // These explicit implementations are taken from
-    // <http://aggregate.org/MAGIC/>:
-    // 
-    // @techreport{magicalgorithms,
-    //   author={Henry Gordon Dietz},
-    //   title={{The Aggregate Magic Algorithms}},
-    //   institution={University of Kentucky},
-    //   howpublished={Aggregate.Org online technical report},
-    //   date={2013-03-25},
-    //   URL={http://aggregate.org/MAGIC/}
-    // }
-    
-    int_t bits = CHAR_BIT * sizeof(int_t);
-    if (bits >  1) x |= lsr(x,  1);
-    if (bits >  2) x |= lsr(x,  2);
-    if (bits >  4) x |= lsr(x,  4);
-    if (bits >  8) x |= lsr(x,  8);
-    if (bits > 16) x |= lsr(x, 16);
-    if (bits > 32) x |= lsr(x, 32);
-    if (bits > 64) x |= lsr(x, 64);
-    assert(bits<=128);
-    return IV(I(bits)) - popcount(x);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x)
-  {
-    return x < IV(I(0));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
-                                                             intvec_t y)
-  {
-    return ifthen(x>=y, x, y);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
-                                                             intvec_t y)
-  {
-    return ifthen(x<y, x, y);
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x)
-  {
-    // These explicit implementations are taken from
-    // <http://aggregate.org/MAGIC/>:
-    // 
-    // @techreport{magicalgorithms,
-    //   author={Henry Gordon Dietz},
-    //   title={{The Aggregate Magic Algorithms}},
-    //   institution={University of Kentucky},
-    //   howpublished={Aggregate.Org online technical report},
-    //   date={2013-03-25},
-    //   URL={http://aggregate.org/MAGIC/}
-    // }
-    
-    int_t bits = CHAR_BIT * sizeof(int_t);
-    
-    // intvec_t x55 = IV(FP::replicate_byte(0x55));
-    // intvec_t x33 = IV(FP::replicate_byte(0x33));
-    // intvec_t x0f = IV(FP::replicate_byte(0x0f));
-    intvec_t x55 = I(~U(0) /  U(3)); // 0x0101...
-    intvec_t x33 = I(~U(0) /  U(5)); // 0x00110011...
-    intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
-    
-    x -= lsr(x, I(1)) & x55;
-    x = (x & x33) + (lsr(x, I(2)) & x33);
-    x += lsr(x, I(4));
-    x &= x0f;
-    if (bits >  8) x += lsr(x,  I(8));
-    if (bits > 16) x += lsr(x, I(16));
-    if (bits > 32) x += lsr(x, I(32));
-    if (bits > 64) x += lsr(x, I(64));
-    assert(bits<=128);
-    return x & IV(I(0xff));
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
-                                                                int_t n)
-  {
-    int_t mask = CHAR_BIT * sizeof(int_t) - 1;
-    intvec_t left = x << (n & mask);
-    intvec_t right = lsr(x, -n & mask);
-    return left | right;
-  }
-  
-  template<typename realvec_t>
-  typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
-                                                                intvec_t n)
-  {
-    intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
-    intvec_t left = x << (n & mask);
-    intvec_t right = lsr(x, -n & mask);
-    return left | right;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_INT_H
diff --git a/mathfuncs_log.h b/mathfuncs_log.h
index cd71eb3..fa517ba 100644
--- a/mathfuncs_log.h
+++ b/mathfuncs_log.h
@@ -7,93 +7,82 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x) {
+  // Algorithm inspired by SLEEF 2.80
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x)
-  {
-    // Algorithm inspired by SLEEF 2.80
-    
-    // Rescale
-    intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
-    x = ldexp(x, -ilogb_x);
-    VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
-    
-    realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
-    realvec_t y2 = y*y;
-    
-    realvec_t r;
-    switch (sizeof(real_t)) {
-    case 4:
-      // float, error=7.09807175879142775648452461821e-8
-      r = RV(0.59723611417135718739797302426);
-      r = mad(r, y2, RV(0.961524413175528426101613434));
-      r = mad(r, y2, RV(2.88539097665498228703236701));
-      break;
-    case 8:
+  // Rescale
+  intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
+  x = ldexp(x, -ilogb_x);
+  VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
+
+  realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
+  realvec_t y2 = y * y;
+
+  realvec_t r;
+  switch (sizeof(real_t)) {
+  case 4:
+    // float, error=7.09807175879142775648452461821e-8
+    r = RV(0.59723611417135718739797302426);
+    r = mad(r, y2, RV(0.961524413175528426101613434));
+    r = mad(r, y2, RV(2.88539097665498228703236701));
+    break;
+  case 8:
 #ifdef VML_HAVE_FP_CONTRACT
-      // double, error=1.48294180185938512675770096324e-16
-      r = RV(0.243683403415639178527756320773);
-      r = mad(r, y2, RV(0.26136626803870009948502658));
-      r = mad(r, y2, RV(0.320619429891299265439389));
-      r = mad(r, y2, RV(0.4121983452028499242926));
-      r = mad(r, y2, RV(0.577078017761894161436));
-      r = mad(r, y2, RV(0.96179669392233355927));
-      r = mad(r, y2, RV(2.8853900817779295236));
+    // double, error=1.48294180185938512675770096324e-16
+    r = RV(0.243683403415639178527756320773);
+    r = mad(r, y2, RV(0.26136626803870009948502658));
+    r = mad(r, y2, RV(0.320619429891299265439389));
+    r = mad(r, y2, RV(0.4121983452028499242926));
+    r = mad(r, y2, RV(0.577078017761894161436));
+    r = mad(r, y2, RV(0.96179669392233355927));
+    r = mad(r, y2, RV(2.8853900817779295236));
 #else
-      // double, error=2.1410114030383689267772704676e-14
-      r = RV(0.283751646449323373643963474845);
-      r = mad(r, y2, RV(0.31983138095551191299118812));
-      r = mad(r, y2, RV(0.412211603844146279666022));
-      r = mad(r, y2, RV(0.5770779098948940070516));
-      r = mad(r, y2, RV(0.961796694295973716912));
-      r = mad(r, y2, RV(2.885390081777562819196));
+    // double, error=2.1410114030383689267772704676e-14
+    r = RV(0.283751646449323373643963474845);
+    r = mad(r, y2, RV(0.31983138095551191299118812));
+    r = mad(r, y2, RV(0.412211603844146279666022));
+    r = mad(r, y2, RV(0.5770779098948940070516));
+    r = mad(r, y2, RV(0.961796694295973716912));
+    r = mad(r, y2, RV(2.885390081777562819196));
 #endif
-      break;
-    default:
-      __builtin_unreachable();
-    }
-    r *= y;
-    
-    // Undo rescaling
-    r += convert_float(ilogb_x);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x)
-  {
-    return log2(x) * RV(M_LN2);
+    break;
+  default:
+    __builtin_unreachable();
   }
+  r *= y;
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x)
-  {
-    return log(x) * RV(M_LOG10E);
-  }
+  // Undo rescaling
+  r += convert_float(ilogb_x);
 
-  template<typename realvec_t>
-  inline
-  realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x)
-  {
-    // TODO: Check SLEEF 2.80 algorithm
-    
-    return log(RV(1.0) + x);
+  return r;
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x) {
+  return log2(x) * RV(M_LN2);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x) {
+  return log(x) * RV(M_LOG10E);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x) {
+  // TODO: Check SLEEF 2.80 algorithm
+
+  return log(RV(1.0) + x);
 #if 0
     // Goldberg, theorem 4
     realvec_t x1 = RV(1.0) + x;
     x1.barrier();
     return ifthen(x1 == x, x, x * log(x1) / (x1 - RV(1.0)));
 #endif
-  }
-  
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_LOG_H
+#endif // #ifndef MATHFUNCS_LOG_H
diff --git a/mathfuncs_pow.h b/mathfuncs_pow.h
index b863570..70bcc80 100644
--- a/mathfuncs_pow.h
+++ b/mathfuncs_pow.h
@@ -7,30 +7,27 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y) {
+  // Handle zero
+  boolvec_t is_zero = x == RV(0.0);
+  x = ifthen(is_zero, RV(1.0), x);
+
+  realvec_t r = exp(log(fabs(x)) * y);
+
+  // The result is negative if x<0 and if y is integer and odd
+  realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
+  realvec_t sign = copysign(mod_y, x) + RV(0.5);
+  r = copysign(r, sign);
+
+  // Handle zero
+  r = ifthen(is_zero, RV(0.0), r);
+
+  return r;
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y)
-  {
-    // Handle zero
-    boolvec_t is_zero = x == RV(0.0);
-    x = ifthen(is_zero, RV(1.0), x);
-    
-    realvec_t r = exp(log(fabs(x)) * y);
-    
-    // The result is negative if x<0 and if y is integer and odd
-    realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
-    realvec_t sign = copysign(mod_y, x) + RV(0.5);
-    r = copysign(r, sign);
-    
-    // Handle zero
-    r = ifthen(is_zero, RV(0.0), r);
-    
-    return r;
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_POW_H
+#endif // #ifndef MATHFUNCS_POW_H
diff --git a/mathfuncs_rcp.h b/mathfuncs_rcp.h
index 6e12b27..f703454 100644
--- a/mathfuncs_rcp.h
+++ b/mathfuncs_rcp.h
@@ -7,10 +7,8 @@
 
 #include <cmath>
 
-
-
 namespace vecmathlib {
-  
+
 #if 0
   // This routine works, but may be slower than the one below
   template<typename realvec_t>
@@ -50,66 +48,61 @@ namespace vecmathlib {
     return r;
   }
 #endif
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x)
-  {
-    // Handle negative values
-    realvec_t x0 = x;
-    x = fabs(x);
-    
-    // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
-    
-    // Initial guess
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x) {
+  // Handle negative values
+  realvec_t x0 = x;
+  x = fabs(x);
+
+  // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
+
+  // Initial guess
+  VML_ASSERT(all(x > RV(0.0)));
+  intvec_t x_exp;
+  x = frexp(x, &x_exp);
+  VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
+  realvec_t r = RV(R(48.0) / R(17.0)) - RV(R(32.0) / R(17.0)) * x;
+
+  // Iterate
+  int const nmax = sizeof(real_t) == 4 ? 3 : 4;
+  for (int n = 0; n < nmax; ++n) {
+    // Step
     VML_ASSERT(all(x > RV(0.0)));
-    intvec_t x_exp;
-    x = frexp(x, &x_exp);
-    VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
-    realvec_t r = RV(R(48.0)/R(17.0)) - RV(R(32.0)/R(17.0)) * x;
-    
-    // Iterate
-    int const nmax = sizeof(real_t)==4 ? 3 : 4;
-    for (int n=0; n<nmax; ++n) {
-      // Step
-      VML_ASSERT(all(x > RV(0.0)));
-      // Newton method:
-      // Solve   f(r) = 0   for   f(r) = x - 1/r
-      //    r <- r - f(r) / f'(r)
-      //    r <- 2 r - r^2 x
-      //    r <- r + r (1 - r x)
-      
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r);
-      
-      // NEON: r = r * (RV(2.0) - x*r);
-    }
-    r = ldexp(r, -x_exp);
-    
-    // Handle negative values
-    r = copysign(r, x0);
-    
-    return r;
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y)
-  {
-    return x - rint(x / y) * y;
-    // realvec_t r = x / y;
-    // return y * (r - rint(r));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y)
-  {
-    return x - y * trunc(x / y);
-    // realvec_t r = x / y;
-    // return y * (r - trunc(r));
+    // Newton method:
+    // Solve   f(r) = 0   for   f(r) = x - 1/r
+    //    r <- r - f(r) / f'(r)
+    //    r <- 2 r - r^2 x
+    //    r <- r + r (1 - r x)
+
+    // Note: don't rewrite this expression; doing so may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r);
+
+    // NEON: r = r * (RV(2.0) - x*r);
   }
-  
+  r = ldexp(r, -x_exp);
+
+  // Handle negative values
+  r = copysign(r, x0);
+
+  return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y) {
+  return x - rint(x / y) * y;
+  // realvec_t r = x / y;
+  // return y * (r - rint(r));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y) {
+  return x - y * trunc(x / y);
+  // realvec_t r = x / y;
+  // return y * (r - trunc(r));
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_RCP_H
+#endif // #ifndef MATHFUNCS_RCP_H
diff --git a/mathfuncs_sin.h b/mathfuncs_sin.h
index 8e2afd9..72ffb6f 100644
--- a/mathfuncs_sin.h
+++ b/mathfuncs_sin.h
@@ -7,230 +7,227 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
+  }
+
+  realvec_t q = rint(d * RV(M_1_PI));
+  intvec_t iq = convert_int(q);
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = rint(d * RV(M_1_PI));
-    intvec_t iq = convert_int(q);
-    
 #ifdef VML_HAVE_FP_CONTRACT
-    d = mad(q, RV(-PI4_A*4), d);
-    d = mad(q, RV(-PI4_B*4), d);
-    d = mad(q, RV(-PI4_C*4), d);
-    d = mad(q, RV(-PI4_D*4), d);
+  d = mad(q, RV(-PI4_A * 4), d);
+  d = mad(q, RV(-PI4_B * 4), d);
+  d = mad(q, RV(-PI4_C * 4), d);
+  d = mad(q, RV(-PI4_D * 4), d);
 #else
-    d = mad(q, RV(-M_PI), d);
+  d = mad(q, RV(-M_PI), d);
 #endif
-    
-    realvec_t s = d * d;
-    
-    d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(2.6083159809786593541503e-06f);
-      u = mad(u, s, RV(-0.0001981069071916863322258f));
-      u = mad(u, s, RV(0.00833307858556509017944336f));
-      u = mad(u, s, RV(-0.166666597127914428710938f));
-      break;
-    case sizeof(double):
-      u = RV(-7.97255955009037868891952e-18);
-      u = mad(u, s, RV(2.81009972710863200091251e-15));
-      u = mad(u, s, RV(-7.64712219118158833288484e-13));
-      u = mad(u, s, RV(1.60590430605664501629054e-10));
-      u = mad(u, s, RV(-2.50521083763502045810755e-08));
-      u = mad(u, s, RV(2.75573192239198747630416e-06));
-      u = mad(u, s, RV(-0.000198412698412696162806809));
-      u = mad(u, s, RV(0.00833333333333332974823815));
-      u = mad(u, s, RV(-0.166666666666666657414808));
-      break;
-    }
-    
-    u = mad(s, u * d, d);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = d * d;
+
+  d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(2.6083159809786593541503e-06f);
+    u = mad(u, s, RV(-0.0001981069071916863322258f));
+    u = mad(u, s, RV(0.00833307858556509017944336f));
+    u = mad(u, s, RV(-0.166666597127914428710938f));
+    break;
+  case sizeof(double):
+    u = RV(-7.97255955009037868891952e-18);
+    u = mad(u, s, RV(2.81009972710863200091251e-15));
+    u = mad(u, s, RV(-7.64712219118158833288484e-13));
+    u = mad(u, s, RV(1.60590430605664501629054e-10));
+    u = mad(u, s, RV(-2.50521083763502045810755e-08));
+    u = mad(u, s, RV(2.75573192239198747630416e-06));
+    u = mad(u, s, RV(-0.000198412698412696162806809));
+    u = mad(u, s, RV(0.00833333333333332974823815));
+    u = mad(u, s, RV(-0.166666666666666657414808));
+    break;
+  }
+
+  u = mad(s, u * d, d);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
   }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
-    intvec_t iq = convert_int(q);
-    
+
+  realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
+  intvec_t iq = convert_int(q);
+
 #ifdef VML_HAVE_FP_CONTRACT
-    d = mad(q, RV(-PI4_A*2), d);
-    d = mad(q, RV(-PI4_B*2), d);
-    d = mad(q, RV(-PI4_C*2), d);
-    d = mad(q, RV(-PI4_D*2), d);
+  d = mad(q, RV(-PI4_A * 2), d);
+  d = mad(q, RV(-PI4_B * 2), d);
+  d = mad(q, RV(-PI4_C * 2), d);
+  d = mad(q, RV(-PI4_D * 2), d);
 #else
-    d = mad(q, RV(-M_PI_2), d);
+  d = mad(q, RV(-M_PI_2), d);
 #endif
-    
-    realvec_t s = d * d;
-    
-    d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(2.6083159809786593541503e-06f);
-      u = mad(u, s, RV(-0.0001981069071916863322258f));
-      u = mad(u, s, RV(0.00833307858556509017944336f));
-      u = mad(u, s, RV(-0.166666597127914428710938f));
-      break;
-    case sizeof(double):
-      u = RV(-7.97255955009037868891952e-18);
-      u = mad(u, s, RV(2.81009972710863200091251e-15));
-      u = mad(u, s, RV(-7.64712219118158833288484e-13));
-      u = mad(u, s, RV(1.60590430605664501629054e-10));
-      u = mad(u, s, RV(-2.50521083763502045810755e-08));
-      u = mad(u, s, RV(2.75573192239198747630416e-06));
-      u = mad(u, s, RV(-0.000198412698412696162806809));
-      u = mad(u, s, RV(0.00833333333333332974823815));
-      u = mad(u, s, RV(-0.166666666666666657414808));
-      break;
-    }
-    
-    u = mad(s, u * d, d);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = d * d;
+
+  d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(2.6083159809786593541503e-06f);
+    u = mad(u, s, RV(-0.0001981069071916863322258f));
+    u = mad(u, s, RV(0.00833307858556509017944336f));
+    u = mad(u, s, RV(-0.166666597127914428710938f));
+    break;
+  case sizeof(double):
+    u = RV(-7.97255955009037868891952e-18);
+    u = mad(u, s, RV(2.81009972710863200091251e-15));
+    u = mad(u, s, RV(-7.64712219118158833288484e-13));
+    u = mad(u, s, RV(1.60590430605664501629054e-10));
+    u = mad(u, s, RV(-2.50521083763502045810755e-08));
+    u = mad(u, s, RV(2.75573192239198747630416e-06));
+    u = mad(u, s, RV(-0.000198412698412696162806809));
+    u = mad(u, s, RV(0.00833333333333332974823815));
+    u = mad(u, s, RV(-0.166666666666666657414808));
+    break;
   }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d)
-  {
-    // Algorithm taken from SLEEF 2.80
-    
-    real_t PI4_A, PI4_B, PI4_C, PI4_D;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      PI4_A = 0.78515625f;
-      PI4_B = 0.00024187564849853515625f;
-      PI4_C = 3.7747668102383613586e-08f;
-      PI4_D = 1.2816720341285448015e-12f;
-      break;
-    case sizeof(double):
-      PI4_A = 0.78539816290140151978;
-      PI4_B = 4.9604678871439933374e-10;
-      PI4_C = 1.1258708853173288931e-18;
-      PI4_D = 1.7607799325916000908e-27;
-      break;
-    }
-    
-    realvec_t q = rint(d * RV(2 * M_1_PI));
-    intvec_t iq = convert_int(q);
-    
-    realvec_t x = d;
-    
+
+  u = mad(s, u * d, d);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d) {
+  // Algorithm taken from SLEEF 2.80
+
+  real_t PI4_A, PI4_B, PI4_C, PI4_D;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    PI4_A = 0.78515625f;
+    PI4_B = 0.00024187564849853515625f;
+    PI4_C = 3.7747668102383613586e-08f;
+    PI4_D = 1.2816720341285448015e-12f;
+    break;
+  case sizeof(double):
+    PI4_A = 0.78539816290140151978;
+    PI4_B = 4.9604678871439933374e-10;
+    PI4_C = 1.1258708853173288931e-18;
+    PI4_D = 1.7607799325916000908e-27;
+    break;
+  }
+
+  realvec_t q = rint(d * RV(2 * M_1_PI));
+  intvec_t iq = convert_int(q);
+
+  realvec_t x = d;
+
 #ifdef VML_HAVE_FP_CONTRACT
-    x = mad(q, RV(-PI4_A*2), x);
-    x = mad(q, RV(-PI4_B*2), x);
-    x = mad(q, RV(-PI4_C*2), x);
-    x = mad(q, RV(-PI4_D*2), x);
+  x = mad(q, RV(-PI4_A * 2), x);
+  x = mad(q, RV(-PI4_B * 2), x);
+  x = mad(q, RV(-PI4_C * 2), x);
+  x = mad(q, RV(-PI4_D * 2), x);
 #else
-    x = mad(q, RV(-M_PI_2), x);
+  x = mad(q, RV(-M_PI_2), x);
 #endif
-    
-    realvec_t s = x * x;
-    
-    x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
-    
-    realvec_t u;
-    switch (sizeof(real_t)) {
-    default: __builtin_unreachable();
-    case sizeof(float):
-      u = RV(0.00927245803177356719970703f);
-      u = mad(u, s, RV(0.00331984995864331722259521f));
-      u = mad(u, s, RV(0.0242998078465461730957031f));
-      u = mad(u, s, RV(0.0534495301544666290283203f));
-      u = mad(u, s, RV(0.133383005857467651367188f));
-      u = mad(u, s, RV(0.333331853151321411132812f));
-      break;
-    case sizeof(double):
-      u = RV(1.01419718511083373224408e-05);
-      u = mad(u, s, RV(-2.59519791585924697698614e-05));
-      u = mad(u, s, RV(5.23388081915899855325186e-05));
-      u = mad(u, s, RV(-3.05033014433946488225616e-05));
-      u = mad(u, s, RV(7.14707504084242744267497e-05));
-      u = mad(u, s, RV(8.09674518280159187045078e-05));
-      u = mad(u, s, RV(0.000244884931879331847054404));
-      u = mad(u, s, RV(0.000588505168743587154904506));
-      u = mad(u, s, RV(0.00145612788922812427978848));
-      u = mad(u, s, RV(0.00359208743836906619142924));
-      u = mad(u, s, RV(0.00886323944362401618113356));
-      u = mad(u, s, RV(0.0218694882853846389592078));
-      u = mad(u, s, RV(0.0539682539781298417636002));
-      u = mad(u, s, RV(0.133333333333125941821962));
-      u = mad(u, s, RV(0.333333333333334980164153));
-      break;
-    }
-    
-    u = mad(s, u * x, x);
-    
-    u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
-    
-    const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
-    u = ifthen(isinf(d), RV(nan), u);
-    
-    return u;
+
+  realvec_t s = x * x;
+
+  x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
+
+  realvec_t u;
+  switch (sizeof(real_t)) {
+  default:
+    __builtin_unreachable();
+  case sizeof(float):
+    u = RV(0.00927245803177356719970703f);
+    u = mad(u, s, RV(0.00331984995864331722259521f));
+    u = mad(u, s, RV(0.0242998078465461730957031f));
+    u = mad(u, s, RV(0.0534495301544666290283203f));
+    u = mad(u, s, RV(0.133383005857467651367188f));
+    u = mad(u, s, RV(0.333331853151321411132812f));
+    break;
+  case sizeof(double):
+    u = RV(1.01419718511083373224408e-05);
+    u = mad(u, s, RV(-2.59519791585924697698614e-05));
+    u = mad(u, s, RV(5.23388081915899855325186e-05));
+    u = mad(u, s, RV(-3.05033014433946488225616e-05));
+    u = mad(u, s, RV(7.14707504084242744267497e-05));
+    u = mad(u, s, RV(8.09674518280159187045078e-05));
+    u = mad(u, s, RV(0.000244884931879331847054404));
+    u = mad(u, s, RV(0.000588505168743587154904506));
+    u = mad(u, s, RV(0.00145612788922812427978848));
+    u = mad(u, s, RV(0.00359208743836906619142924));
+    u = mad(u, s, RV(0.00886323944362401618113356));
+    u = mad(u, s, RV(0.0218694882853846389592078));
+    u = mad(u, s, RV(0.0539682539781298417636002));
+    u = mad(u, s, RV(0.133333333333125941821962));
+    u = mad(u, s, RV(0.333333333333334980164153));
+    break;
   }
-  
+
+  u = mad(s, u * x, x);
+
+  u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
+
+  const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+  u = ifthen(isinf(d), RV(nan), u);
+
+  return u;
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SIN_H
+#endif // #ifndef MATHFUNCS_SIN_H
diff --git a/mathfuncs_sinh.h b/mathfuncs_sinh.h
index 04aa446..a8c2ee3 100644
--- a/mathfuncs_sinh.h
+++ b/mathfuncs_sinh.h
@@ -7,28 +7,23 @@
 
 #include <cmath>
 
+namespace vecmathlib {
 
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x) {
+  return RV(0.5) * (exp(x) + exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x) {
+  return RV(0.5) * (exp(x) - exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x) {
+  return sinh(x) / cosh(x);
+}
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x)
-  {
-    return RV(0.5) * (exp(x) + exp(-x));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x)
-  {
-    return RV(0.5) * (exp(x) - exp(-x));
-  }
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x)
-  {
-    return sinh(x) / cosh(x);
-  }
-  
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SINH_H
+#endif // #ifndef MATHFUNCS_SINH_H
diff --git a/mathfuncs_sqrt.h b/mathfuncs_sqrt.h
index dea5fd6..7a362f9 100644
--- a/mathfuncs_sqrt.h
+++ b/mathfuncs_sqrt.h
@@ -7,13 +7,10 @@
 
 #include <cmath>
 
-
-
 namespace vecmathlib {
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
-  {
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) {
 #if 0
     // Handle special case: zero
     boolvec_t is_zero = x <= RV(0.0);
@@ -49,29 +46,23 @@ namespace vecmathlib {
     // Handle special case: zero
     r = ifthen(is_zero, RV(0.0), r);
 #endif
-    
-    realvec_t r = x * rsqrt(x);
-    // Handle special case: zero
-    r = ifthen(x == RV(0.0), RV(0.0), r);
-    
-    return r;
-  }
-  
-  
-  
-  // TODO: Use "Halley's method with cubic convergence":
-  // <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x)
-  {
-    return pow(x, RV(1.0/3.0));
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
-  {
+
+  realvec_t r = x * rsqrt(x);
+  // Handle special case: zero
+  r = ifthen(x == RV(0.0), RV(0.0), r);
+
+  return r;
+}
+
+// TODO: Use "Halley's method with cubic convergence":
+// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) {
+  return pow(x, RV(1.0 / 3.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) {
 #if 0
     // See <http://en.wikipedia.org/wiki/Fast_inverse_square_root>
     realvec_t x_2 = RV(0.5) * x;
@@ -85,46 +76,43 @@ namespace vecmathlib {
     r += r * (RV(0.5) - (x_2 * r * r));
     return r;
 #else
-    // Initial guess
-    // VML_ASSERT(all(x > RV(0.0)));
-    intvec_t ilogb_x = ilogb(x);
-    realvec_t s =
+  // Initial guess
+  // VML_ASSERT(all(x > RV(0.0)));
+  intvec_t ilogb_x = ilogb(x);
+  realvec_t s =
       ifthen(convert_bool(ilogb_x & IV(I(1))), RV(R(0.583)), RV(R(0.824)));
-    realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
-    
-    realvec_t x_2 = RV(0.5) * x;
-    
-    // Iterate
-    // nmax iterations give an accuracy of 2^nmax binary digits. 5
-    // iterations suffice for double precision with its 53 digits.
-    int const nmax = sizeof(real_t)==4 ? 4 : 5;
-    for (int n=0; n<nmax; ++n) {
-      // Step
-      VML_ASSERT(all(r > RV(0.0)));
-      // Newton method:
-      // Solve   f(r) = 0   for   f(r) = x - 1/r^2
-      //    r <- r - f(r) / f'(r)
-      //    r <- (3 r - r^3 x) / 2
-      //    r <- r (3/2 - r^2 x/2)
-      
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors (says who?)
-      // r *= RV(1.5) - x_2 * r*r;
-      r += r * (RV(0.5) - x_2 * r*r);
-    }
-    
-    return r;
-#endif
-  }
-  
-  
-  
-  template<typename realvec_t>
-  realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y)
-  {
-    return sqrt(x*x + y*y);
+  realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
+
+  realvec_t x_2 = RV(0.5) * x;
+
+  // Iterate
+  // nmax iterations give an accuracy of 2^nmax binary digits. 5
+  // iterations suffice for double precision with its 53 digits.
+  int const nmax = sizeof(real_t) == 4 ? 4 : 5;
+  for (int n = 0; n < nmax; ++n) {
+    // Step
+    VML_ASSERT(all(r > RV(0.0)));
+    // Newton method:
+    // Solve   f(r) = 0   for   f(r) = x - 1/r^2
+    //    r <- r - f(r) / f'(r)
+    //    r <- (3 r - r^3 x) / 2
+    //    r <- r (3/2 - r^2 x/2)
+
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors (says who?)
+    // r *= RV(1.5) - x_2 * r*r;
+    r += r * (RV(0.5) - x_2 * r * r);
   }
-  
+
+  return r;
+#endif
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) {
+  return sqrt(x * x + y * y);
+}
+
 }; // namespace vecmathlib
 
-#endif  // #ifndef MATHFUNCS_SQRT_H
+#endif // #ifndef MATHFUNCS_SQRT_H
diff --git a/selftest.cc b/selftest.cc
index 4296f14..334d95f 100644
--- a/selftest.cc
+++ b/selftest.cc
@@ -14,22 +14,17 @@
 
 using namespace std;
 
-
-  
 int num_errors = 0;
 
+template <typename realvec_t> struct vecmathlib_test {
 
-
-template<typename realvec_t>
-struct vecmathlib_test {
-  
   typedef typename realvec_t::boolvec_t boolvec_t;
   typedef typename realvec_t::intvec_t intvec_t;
-  
+
   typedef typename realvec_t::int_t int_t;
   typedef typename realvec_t::uint_t uint_t;
   typedef typename realvec_t::real_t real_t;
-  
+
   // Short names for type casts
   typedef real_t R;
   typedef int_t I;
@@ -37,16 +32,13 @@ struct vecmathlib_test {
   typedef realvec_t RV;
   typedef intvec_t IV;
   typedef boolvec_t BV;
-  
+
   typedef vecmathlib::floatprops<real_t> FP;
   typedef vecmathlib::mathfuncs<realvec_t> MF;
-  
-  
-  
+
   // Test each function with this many random values
   static const int imax = 10000;
-  static real_t accuracy(real_t ulp = R(0.5))
-  {
+  static real_t accuracy(real_t ulp = R(0.5)) {
 #ifdef VML_HAVE_FP_CONTRACT
     // Require that 100% of the digits are correct
     // real_t digit_fraction = 1.0;
@@ -56,526 +48,451 @@ struct vecmathlib_test {
     // Require that 80% of the digits are correct
     real_t digit_fraction = 0.8;
 #endif
-    digit_fraction *= 0.95;     // some lenience for testing (why?)
+    digit_fraction *= 0.95; // some lenience for testing (why?)
     return pow(ulp * realvec_t::epsilon(), digit_fraction);
   }
-  
-  
-  
-  static realvec_t random(const real_t xmin, const real_t xmax)
-  {
+
+  static realvec_t random(const real_t xmin, const real_t xmax) {
     realvec_t x;
-    for (int i=0; i<realvec_t::size; ++i) {
-      const real_t r =
-        (xmax - xmin) * FP::convert_float(rand()) / FP::convert_float(RAND_MAX);
+    for (int i = 0; i < realvec_t::size; ++i) {
+      const real_t r = (xmax - xmin) * FP::convert_float(rand()) /
+                       FP::convert_float(RAND_MAX);
       x.set_elt(i, xmin + r);
     }
     return x;
   }
-  
-  static intvec_t random(const int_t nmin, const int_t nmax)
-  {
+
+  static intvec_t random(const int_t nmin, const int_t nmax) {
     intvec_t n;
-    for (int i=0; i<intvec_t::size; ++i) {
-      const real_t r =
-        R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0));
+    for (int i = 0; i < intvec_t::size; ++i) {
+      const real_t r = R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0));
       n.set_elt(i, nmin + FP::convert_int(floor(r)));
     }
     return n;
   }
-  
-  
-  
-  static bool is_big_endian()
-  {
+
+  static bool is_big_endian() {
     const int i = 1;
     unsigned char cs[sizeof i];
     memcpy(cs, &i, sizeof i);
-    return cs[0]==0;
+    return cs[0] == 0;
   }
-  
-  template<typename T>
-  static string hex(const T x)
-  {
+
+  template <typename T> static string hex(const T x) {
     unsigned char cs[sizeof x];
     memcpy(cs, &x, sizeof x);
     ostringstream buf;
     buf << "0x";
-    const char* const hexdigits = "0123456789abcdef";
+    const char *const hexdigits = "0123456789abcdef";
     const int n0 = is_big_endian() ? 0 : sizeof x - 1;
     const int dn = is_big_endian() ? +1 : -1;
     const int n1 = n0 + sizeof x * dn;
-    for (int n=n0; n!=n1; n+=dn) {
-      buf << hexdigits[cs[n]>>4] << hexdigits[cs[n]&15];
+    for (int n = n0; n != n1; n += dn) {
+      buf << hexdigits[cs[n] >> 4] << hexdigits[cs[n] & 15];
     }
     return buf.str();
   }
-  
-  
-  
-  static boolvec_t supported(realvec_t x)
-  {
-    return x==RV(0.0) || MF::vml_ieee_isnormal(x)
+
+  static boolvec_t supported(realvec_t x) {
+    return x == RV(0.0) || MF::vml_ieee_isnormal(x)
 #ifdef VML_HAVE_DENORMALS
-      || MF::vml_ieee_isfinite(x)
+           || MF::vml_ieee_isfinite(x)
 #endif
 #ifdef VML_HAVE_INF
-      || MF::vml_ieee_isinf(x)
+           || MF::vml_ieee_isinf(x)
 #endif
 #ifdef VML_HAVE_NAN
-      || MF::vml_ieee_isnan(x)
+           || MF::vml_ieee_isnan(x)
 #endif
-      ;
-  }
-  
-  static boolvec_t supported(intvec_t x)
-  {
-    return true;
-  }
-  
-  static boolvec_t supported(boolvec_t x)
-  {
-    return true;
+        ;
   }
-  
-  
-  
+
+  static boolvec_t supported(intvec_t x) { return true; }
+
+  static boolvec_t supported(boolvec_t x) { return true; }
+
   // Check load memory access
-  static void check_mem(const char* const func,
-                        const realvec_t x,
-                        const real_t* const p,
-                        const realvec_t xold,
-                        const int mval)
-  {
+  static void check_mem(const char *const func, const realvec_t x,
+                        const real_t *const p, const realvec_t xold,
+                        const int mval) {
     realvec_t xwant;
-    for (int i=0; i<realvec_t::size; ++i) {
-      xwant.set_elt(i, mval & (1<<i) ? p[i] : xold[i]);
+    for (int i = 0; i < realvec_t::size; ++i) {
+      xwant.set_elt(i, mval & (1 << i) ? p[i] : xold[i]);
     }
     const boolvec_t isbad = x != xwant;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   found=" << x << " [" << hex(x) << "]\n"
            << "   expected=" << xwant << " [" << hex(xwant) << "]\n"
            << "   mval=" << mval << " [" << hex(mval) << "]\n"
-           << "   isbad=" << isbad << "\n"
-           << flush;
+           << "   isbad=" << isbad << "\n" << flush;
     }
   }
-  
+
   // Check store memory access
-  static void check_mem(const char* const func,
-                        const real_t* const p,
-                        const realvec_t x,
-                        const real_t* const pold,
-                        const int mval)
-  {
+  static void check_mem(const char *const func, const real_t *const p,
+                        const realvec_t x, const real_t *const pold,
+                        const int mval) {
     realvec_t pv, pvwant;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       pv.set_elt(i, p[i]);
-      pvwant.set_elt(i, mval & (1<<i) ? x[i] : pold[i]);
+      pvwant.set_elt(i, mval & (1 << i) ? x[i] : pold[i]);
     }
     const boolvec_t isbad = pv != pvwant;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   found=" << pv << " [" << hex(pv) << "]\n"
            << "   expected=" << pvwant << " [" << hex(pvwant) << "]\n"
-           << "   isbad=" << isbad << "\n"
-           << flush;
+           << "   isbad=" << isbad << "\n" << flush;
     }
   }
-  
-  static void check_bool(const char* const func,
-                         const bool rstd, const bool rvml)
-  {
+
+  static void check_bool(const char *const func, const bool rstd,
+                         const bool rvml) {
     const bool dr = rstd ^ rvml;
     const bool isbad = dr;
     if (isbad) {
-      ++ num_errors;
+      ++num_errors;
       cout << "Error in " << func << ":\n"
            << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n"
-           << "   isbad()=" << isbad << "\n"
-           << flush;
+           << "   isbad()=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_bool(const char* const func,
-                         const bool rstd, const bool rvml, const A x)
-  {
+
+  template <typename A>
+  static void check_bool(const char *const func, const bool rstd,
+                         const bool rvml, const A x) {
     const bool dr = rstd ^ rvml;
     const bool isbad = dr;
     if (isbad) {
-      ++ num_errors;
+      ++num_errors;
       cout << "Error in " << func << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
-           << "   isbad(x)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_bool(const char* const func,
-                         const boolvec_t rstd, const boolvec_t rvml,
-                         const A x)
-  {
+
+  template <typename A>
+  static void check_bool(const char *const func, const boolvec_t rstd,
+                         const boolvec_t rvml, const A x) {
     boolvec_t dr;
     bool isbad = false;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       dr.set_elt(i, rstd[i] ^ rvml[i]);
       isbad |= dr[i];
     }
     if (isbad) {
-      ++ num_errors;
+      ++num_errors;
       cout << "Error in " << func << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x)=" << dr << " [" << hex(rvml) << "]\n"
-           << "   isbad(x)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_bool(const char* const func,
-                         const boolvec_t rstd, const boolvec_t rvml,
-                         const A x, const B y)
-  {
+
+  template <typename A, typename B>
+  static void check_bool(const char *const func, const boolvec_t rstd,
+                         const boolvec_t rvml, const A x, const B y) {
     boolvec_t dr;
     bool isbad = false;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       dr.set_elt(i, rstd[i] ^ rvml[i]);
       isbad |= dr[i];
     }
     if (isbad) {
-      ++ num_errors;
+      ++num_errors;
       cout << "Error in " << func << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y)=" << dr << " [" << hex(rvml) << "]\n"
-           << "   isbad(x,y)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_bool(const char* const func,
-                         bool fstd(typename A::scalar_t x),
-                         boolvec_t fvml(A x),
-                         const A x)
-  {
+
+  template <typename A>
+  static void check_bool(const char *const func,
+                         bool fstd(typename A::scalar_t x), boolvec_t fvml(A x),
+                         const A x) {
     boolvec_t rstd;
-    for (int i=0; i<boolvec_t::size; ++i) {
+    for (int i = 0; i < boolvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i]));
     }
     const boolvec_t rvml = fvml(x);
     const boolvec_t dr = rstd != rvml;
     const boolvec_t isbad = supported(x) && supported(rstd) && dr;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_bool(const char* const func,
+
+  template <typename A, typename B>
+  static void check_bool(const char *const func,
                          bool fstd(typename A::scalar_t x,
                                    typename B::scalar_t y),
-                         boolvec_t fvml(A x, B y),
-                         const A x, const B y)
-  {
+                         boolvec_t fvml(A x, B y), const A x, const B y) {
     boolvec_t rstd;
-    for (int i=0; i<boolvec_t::size; ++i) {
+    for (int i = 0; i < boolvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i]));
     }
     const boolvec_t rvml = fvml(x, y);
     const boolvec_t dr = rstd != rvml;
     const boolvec_t isbad = supported(x) && supported(rstd) && dr;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x,y)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B, typename C>
-  static void check_bool(const char* const func,
-                         bool fstd(typename A::scalar_t x,
-                                   typename B::scalar_t y,
-                                   typename C::scalar_t z),
-                         boolvec_t fvml(A x, B y, C z),
-                         const A x, const B y, const C z)
-  {
+
+  template <typename A, typename B, typename C>
+  static void
+  check_bool(const char *const func,
+             bool fstd(typename A::scalar_t x, typename B::scalar_t y,
+                       typename C::scalar_t z),
+             boolvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
     boolvec_t rstd;
-    for (int i=0; i<boolvec_t::size; ++i) {
+    for (int i = 0; i < boolvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i], z[i]));
     }
     const boolvec_t rvml = fvml(x, y, z);
     const boolvec_t dr = rstd != rvml;
     const boolvec_t isbad = supported(x) && supported(rstd) && dr;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   z=" << z << " [" << hex(z) << "]\n"
            << "   fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y,z)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x,y,z)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y,z)=" << isbad << "\n" << flush;
     }
   }
-  
-  static void check_int(const char* const func,
-                        const int_t rstd, const int_t rvml)
-  {
+
+  static void check_int(const char *const func, const int_t rstd,
+                        const int_t rvml) {
     const int_t dr = rstd - rvml;
     const bool isbad = dr;
     if (isbad) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error()=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad()=" << isbad << "\n"
-           << flush;
+           << "   isbad()=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_int(const char* const func,
-                        int_t fstd(typename A::scalar_t x),
-                        intvec_t fvml(A x),
-                        const A x)
-  {
+
+  template <typename A>
+  static void check_int(const char *const func,
+                        int_t fstd(typename A::scalar_t x), intvec_t fvml(A x),
+                        const A x) {
     intvec_t rstd;
-    for (int i=0; i<intvec_t::size; ++i) {
+    for (int i = 0; i < intvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i]));
     }
     const intvec_t rvml = fvml(x);
     const intvec_t dr = rstd - rvml;
     const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr);
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_int(const char* const func,
+
+  template <typename A, typename B>
+  static void check_int(const char *const func,
                         int_t fstd(typename A::scalar_t x, B y),
-                        intvec_t fvml(A x, B y),
-                        const A x, const B y)
-  {
+                        intvec_t fvml(A x, B y), const A x, const B y) {
     intvec_t rstd;
-    for (int i=0; i<intvec_t::size; ++i) {
+    for (int i = 0; i < intvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y));
     }
     const intvec_t rvml = fvml(x, y);
     const intvec_t dr = rstd - rvml;
     const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr);
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x,y)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_int(const char* const func,
+
+  template <typename A, typename B>
+  static void check_int(const char *const func,
                         int_t fstd(typename A::scalar_t x,
                                    typename B::scalar_t y),
-                        intvec_t fvml(A x, B y),
-                        const A x, const B y)
-  {
+                        intvec_t fvml(A x, B y), const A x, const B y) {
     intvec_t rstd;
-    for (int i=0; i<intvec_t::size; ++i) {
+    for (int i = 0; i < intvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i]));
     }
     const intvec_t rvml = fvml(x, y);
     const intvec_t dr = rstd - rvml;
     const boolvec_t isbad =
-      supported(x) && supported(y) && supported(rstd) && convert_bool(dr);
+        supported(x) && supported(y) && supported(rstd) && convert_bool(dr);
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x,y)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B, typename C>
-  static void check_int(const char* const func,
-                        int_t fstd(typename A::scalar_t x,
-                                   typename B::scalar_t y,
-                                   typename C::scalar_t z),
-                        intvec_t fvml(A x, B y, C z),
-                        const A x, const B y, const C z)
-  {
+
+  template <typename A, typename B, typename C>
+  static void
+  check_int(const char *const func,
+            int_t fstd(typename A::scalar_t x, typename B::scalar_t y,
+                       typename C::scalar_t z),
+            intvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
     intvec_t rstd;
-    for (int i=0; i<intvec_t::size; ++i) {
+    for (int i = 0; i < intvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i], z[i]));
     }
     const intvec_t rvml = fvml(x, y, z);
     const intvec_t dr = rstd - rvml;
-    const boolvec_t isbad =
-      supported(x) && supported(y) && supported(z) && supported(rstd) &&
-      convert_bool(dr);
+    const boolvec_t isbad = supported(x) && supported(y) && supported(z) &&
+                            supported(rstd) && convert_bool(dr);
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   z=" << z << " [" << hex(z) << "]\n"
            << "   fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x,y,z)=" << dr << " [" << hex(dr) << "]\n"
-           << "   isbad(x,y,z)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x,y,z)=" << isbad << "\n" << flush;
     }
   }
-  
-  static void check_real(const char* const func,
-                         const real_t rstd, const real_t rvml)
-  {
+
+  static void check_real(const char *const func, const real_t rstd,
+                         const real_t rvml) {
     const real_t dr = rstd - rvml;
     const bool isbad = dr != R(0.0);
     if (isbad) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << "():\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << "():\n"
            << "   fstd()=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml()=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error()=" << dr << "\n"
-           << "   isbad()=" << isbad << "\n"
-           << flush;
+           << "   isbad()=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_real(const char* const func,
-                         const real_t rstd, const real_t rvml, const A x,
-                         const real_t accuracy)
-  {
+
+  template <typename A>
+  static void check_real(const char *const func, const real_t rstd,
+                         const real_t rvml, const A x, const real_t accuracy) {
     const real_t dr = rstd - rvml;
     real_t maxabs = 0.0;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       maxabs = vml_std::fmax(maxabs, vml_std::fabs(x[i]));
     }
     const real_t scale = fabs(rstd) + fabs(rvml) + fabs(maxabs) + R(1.0);
     const bool isbad = fabs(dr) > accuracy * scale;
     if (isbad) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << "():\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << "():\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   error(x)=" << dr << "\n"
-           << "   isbad(x)=" << isbad << "\n"
-           << flush;
+           << "   isbad(x)=" << isbad << "\n" << flush;
     }
   }
-  
-  template<typename A>
-  static void check_real(const char* const func,
-                         real_t fstd(typename A::scalar_t x),
-                         realvec_t fvml(A x),
-                         const A x,
-                         const real_t accuracy)
-  {
+
+  template <typename A>
+  static void
+  check_real(const char *const func, real_t fstd(typename A::scalar_t x),
+             realvec_t fvml(A x), const A x, const real_t accuracy) {
     realvec_t rstd;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i]));
     }
     const realvec_t rvml = fvml(x);
     const realvec_t dr = rstd - rvml;
     const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
-    const boolvec_t isbad =
-      supported(x) && supported(rstd) &&
-      fabs(dr) > realvec_t(accuracy) * scale;
+    const boolvec_t isbad = supported(x) && supported(rstd) &&
+                            fabs(dr) > realvec_t(accuracy) * scale;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
            << "   fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
            << "   abs-error(x)=" << fabs(dr) << "\n"
            << "   rel-error(x)=" << fabs(dr) / scale << "\n"
            << "   isbad(x)=" << isbad << "\n"
-           << "   accuracy=" << accuracy << "\n"
-           << flush;
+           << "   accuracy=" << accuracy << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_real(const char* const func,
+
+  template <typename A, typename B>
+  static void check_real(const char *const func,
                          real_t fstd(typename A::scalar_t x, B y),
-                         realvec_t fvml(A x, B y),
-                         const A x, const B y,
-                         const real_t accuracy)
-  {
+                         realvec_t fvml(A x, B y), const A x, const B y,
+                         const real_t accuracy) {
     realvec_t rstd;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y));
     }
     const realvec_t rvml = fvml(x, y);
     const realvec_t dr = rstd - rvml;
     const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
-    const boolvec_t isbad =
-      supported(x) && supported(rstd) && fabs(dr) > realvec_t(accuracy) * scale;
+    const boolvec_t isbad = supported(x) && supported(rstd) &&
+                            fabs(dr) > realvec_t(accuracy) * scale;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
@@ -583,38 +500,32 @@ struct vecmathlib_test {
            << "   abs-error(x,y)=" << fabs(dr) << "\n"
            << "   rel-error(x,y)=" << fabs(dr) / scale << "\n"
            << "   isbad(x,y)=" << isbad << "\n"
-           << "   accuracy=" << accuracy << "\n"
-           << flush;
+           << "   accuracy=" << accuracy << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B>
-  static void check_real(const char* const func,
-                         real_t fstd(typename A::scalar_t x,
-                                     typename B::scalar_t y),
-                         realvec_t fvml(A x, B y),
-                         const A x, const B y,
-                         const real_t accuracy,
-                         const realvec_t offset = RV(0.0))
-  {
+
+  template <typename A, typename B>
+  static void
+  check_real(const char *const func,
+             real_t fstd(typename A::scalar_t x, typename B::scalar_t y),
+             realvec_t fvml(A x, B y), const A x, const B y,
+             const real_t accuracy, const realvec_t offset = RV(0.0)) {
     realvec_t rstd;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i]));
     }
     realvec_t rvml = fvml(x, y);
     // Fix up rvml by adding/subtracting the offset
-    rvml = ifthen(fabs(rstd-rvml)>fabs(offset/RV(2.0)),
-                  rvml + copysign(offset, rstd-rvml),
-                  rvml);
+    rvml = ifthen(fabs(rstd - rvml) > fabs(offset / RV(2.0)),
+                  rvml + copysign(offset, rstd - rvml), rvml);
     const realvec_t dr = rstd - rvml;
     const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
-    const boolvec_t isbad =
-      supported(x) && supported(y) && supported(rstd) &&
-      fabs(dr) > realvec_t(accuracy) * scale;
+    const boolvec_t isbad = supported(x) && supported(y) && supported(rstd) &&
+                            fabs(dr) > realvec_t(accuracy) * scale;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
@@ -622,34 +533,31 @@ struct vecmathlib_test {
            << "   abs-error(x,y)=" << fabs(dr) << "\n"
            << "   rel-error(x,y)=" << fabs(dr) / scale << "\n"
            << "   isbad(x,y)=" << isbad << "\n"
-           << "   accuracy=" << accuracy << "\n"
-           << flush;
+           << "   accuracy=" << accuracy << "\n" << flush;
     }
   }
-  
-  template<typename A, typename B, typename C>
-  static void check_real(const char* const func,
+
+  template <typename A, typename B, typename C>
+  static void check_real(const char *const func,
                          real_t fstd(typename A::scalar_t x,
                                      typename B::scalar_t y,
                                      typename C::scalar_t z),
-                         realvec_t fvml(A x, B y, C z),
-                         const A x, const B y, C const z,
-                         const real_t accuracy)
-  {
+                         realvec_t fvml(A x, B y, C z), const A x, const B y,
+                         C const z, const real_t accuracy) {
     realvec_t rstd;
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       rstd.set_elt(i, fstd(x[i], y[i], z[i]));
     }
     const realvec_t rvml = fvml(x, y, z);
     const realvec_t dr = rstd - rvml;
     const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
-    const boolvec_t isbad =
-      supported(x) && supported(y) && supported(z) && supported(rstd) &&
-      fabs(dr) > realvec_t(accuracy) * scale;
+    const boolvec_t isbad = supported(x) && supported(y) && supported(z) &&
+                            supported(rstd) &&
+                            fabs(dr) > realvec_t(accuracy) * scale;
     if (any(isbad)) {
-      ++ num_errors;
-      cout << setprecision(realvec_t::digits10+2)
-           << "Error in " << func << ":\n"
+      ++num_errors;
+      cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+           << ":\n"
            << "   x=" << x << " [" << hex(x) << "]\n"
            << "   y=" << y << " [" << hex(y) << "]\n"
            << "   z=" << z << " [" << hex(z) << "]\n"
@@ -658,61 +566,57 @@ struct vecmathlib_test {
            << "   abs-error(x,y,z)=" << fabs(dr) << "\n"
            << "   rel-error(x,y,z)=" << fabs(dr) / scale << "\n"
            << "   isbad(x,y,z)=" << isbad << "\n"
-           << "   accuracy=" << accuracy << "\n"
-           << flush;
+           << "   accuracy=" << accuracy << "\n" << flush;
     }
   }
-  
-  
-  
-  static real_t* align_mem(real_t* p)
-  {
+
+  static real_t *align_mem(real_t *p) {
     const ptrdiff_t alignment = sizeof(realvec_t);
-    p = (real_t*)((intptr_t(p) + alignment-1) & -alignment);
+    p = (real_t *)((intptr_t(p) + alignment - 1) & -alignment);
     assert(intptr_t(p) % alignment == 0);
     return p;
   }
-  static string add_suffix(const char* str, int i)
-  {
+  static string add_suffix(const char *str, int i) {
     ostringstream buf;
     buf << str << "." << i;
     return buf.str();
   }
-  static void test_mem()
-  {
-    cout << "   testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush;
+  static void test_mem() {
+    cout << "   testing loada loadu storea storeu (errors may lead to "
+            "segfaults)...\n"
+         << flush;
     const int n = 4;
     const int sz = realvec_t::size;
-    const int nbytes = n*sz*sizeof(real_t);
-    real_t* const x = align_mem(new real_t[(n+1)*sz]);
-    real_t* const xnew = align_mem(new real_t[(n+1)*sz]);
-    for (int i=0; i<n; ++i) {
+    const int nbytes = n * sz * sizeof(real_t);
+    real_t *const x = align_mem(new real_t[(n + 1) * sz]);
+    real_t *const xnew = align_mem(new real_t[(n + 1) * sz]);
+    for (int i = 0; i < n; ++i) {
       realvec_t xv = random(R(-10.0), R(+10.0));
-      memcpy(&x[i*sz], &xv, sizeof xv);
+      memcpy(&x[i * sz], &xv, sizeof xv);
     }
     const realvec_t z = random(R(-10.0), R(+10.0));
-    
+
     // loada
     {
       const real_t *p = &x[sz];
       realvec_t y = realvec_t::loada(p);
       check_mem("loada", y, p, z, ~0);
     }
-    
+
     // loadu
-    for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+    for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
       const real_t *p = &x[sz];
-      realvec_t y = realvec_t::loadu(p+i);
-      check_mem(add_suffix("loadu", i).c_str(), y, p+i, z, ~0);
+      realvec_t y = realvec_t::loadu(p + i);
+      check_mem(add_suffix("loadu", i).c_str(), y, p + i, z, ~0);
     }
-    
+
     // loadu(ioff)
-    for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+    for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
       const real_t *p = &x[sz];
       realvec_t y = realvec_t::loadu(p, ioff);
-      check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p+ioff, z, ~0);
+      check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p + ioff, z, ~0);
     }
-    
+
     // storea
     {
       memcpy(xnew, x, nbytes);
@@ -720,50 +624,51 @@ struct vecmathlib_test {
       storea(z, p);
       check_mem("storea", p, z, &x[sz], ~0);
     }
-    
+
     // storeu
-    for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+    for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
       memcpy(xnew, x, nbytes);
       real_t *p = &xnew[sz];
-      storeu(z, p+i);
-      check_mem(add_suffix("storeu", i).c_str(), p+i, z, &x[sz+i], ~0);
+      storeu(z, p + i);
+      check_mem(add_suffix("storeu", i).c_str(), p + i, z, &x[sz + i], ~0);
     }
-    
+
     // storeu
-    for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+    for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
       memcpy(xnew, x, nbytes);
       real_t *p = &xnew[sz];
       storeu(z, p, ioff);
-      check_mem(add_suffix("storeu(ioff)", ioff).c_str(),
-                p+ioff, z, &x[sz+ioff], ~0);
+      check_mem(add_suffix("storeu(ioff)", ioff).c_str(), p + ioff, z,
+                &x[sz + ioff], ~0);
     }
-    
-    for (int mval=0; mval<(1<<realvec_t::size); ++mval) {
+
+    for (int mval = 0; mval < (1 << realvec_t::size); ++mval) {
       boolvec_t mbool;
-      for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i));
+      for (int i = 0; i < realvec_t::size; ++i)
+        mbool.set_elt(i, mval & (1 << i));
       typename realvec_t::mask_t mask(mbool);
-      
+
       // loada(mask)
       {
         const real_t *p = &x[sz];
         realvec_t y = loada(p, z, mask);
         check_mem("loada(mask)", y, p, z, mval);
       }
-      
+
       // loadu(mask)
-      for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+      for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
         const real_t *p = &x[sz];
-        realvec_t y = loadu(p+i, z, mask);
-        check_mem("loadu(mask)", y, p+i, z, mval);
+        realvec_t y = loadu(p + i, z, mask);
+        check_mem("loadu(mask)", y, p + i, z, mval);
       }
-      
+
       // loadu(ioff, mask)
-      for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+      for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
         const real_t *p = &x[sz];
         realvec_t y = loadu(p, ioff, z, mask);
-        check_mem("loadu(ioff,mask)", y, p+ioff, z, mval);
+        check_mem("loadu(ioff,mask)", y, p + ioff, z, mval);
       }
-      
+
       // storea
       {
         memcpy(xnew, x, nbytes);
@@ -771,37 +676,35 @@ struct vecmathlib_test {
         storea(z, p, mask);
         check_mem("storea(mask)", p, z, &x[sz], mval);
       }
-      
+
       // storeu
-      for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+      for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
         memcpy(xnew, x, nbytes);
         real_t *p = &xnew[sz];
-        storeu(z, p+i, mask);
-        check_mem("storeu(mask)", p+i, z, &x[sz+i], mval);
+        storeu(z, p + i, mask);
+        check_mem("storeu(mask)", p + i, z, &x[sz + i], mval);
       }
-      
+
       // storeu
-      for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+      for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
         memcpy(xnew, x, nbytes);
         real_t *p = &xnew[sz];
         storeu(z, p, ioff, mask);
-        check_mem("storeu(ioff,mask)", p+ioff, z, &x[sz+ioff], mval);
+        check_mem("storeu(ioff,mask)", p + ioff, z, &x[sz + ioff], mval);
       }
-      
+
     } // for mval
   }
-  
-  
-  
-  template<typename T>
-  static T local_ifthen(bool b, T x, T y) { return b ? x : y; }
-  static void test_bool()
-  {
+
+  template <typename T> static T local_ifthen(bool b, T x, T y) {
+    return b ? x : y;
+  }
+  static void test_bool() {
     cout << "   testing boolean operations...\n" << flush;
-    
+
     const boolvec_t bf = boolvec_t(false);
     const boolvec_t bt = boolvec_t(true);
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       check_bool("false", false, bf[i]);
       check_bool("true", true, bt[i]);
     }
@@ -809,32 +712,32 @@ struct vecmathlib_test {
     check_bool("all", true, all(bt), true);
     check_bool("any", false, any(bf), false);
     check_bool("any", true, any(bt), true);
-    
+
     boolvec_t b0 = bt;
     boolvec_t b1 = bf;
-    for (int n=0; n<realvec_t::size; ++n) {
+    for (int n = 0; n < realvec_t::size; ++n) {
       b0.set_elt(n, false);
       b1.set_elt(n, true);
-      for (int i=0; i<realvec_t::size; ++i) {
-        check_bool("set_elt", i<=n ? false : true, b0[i], false);
-        check_bool("set_elt", i<=n ? true : false, b1[i], true);
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_bool("set_elt", i <= n ? false : true, b0[i], false);
+        check_bool("set_elt", i <= n ? true : false, b1[i], true);
       }
     }
-    
-    for (int n=0; n<(1<<realvec_t::size); ++n) {
+
+    for (int n = 0; n < (1 << realvec_t::size); ++n) {
       boolvec_t x;
-      for (int i=0; i<realvec_t::size; ++i) {
-        x.set_elt(i, n & (1<<i));
+      for (int i = 0; i < realvec_t::size; ++i) {
+        x.set_elt(i, n & (1 << i));
       }
-      for (int i=0; i<realvec_t::size; ++i) {
-        bool rstd = n & (1<<i);
+      for (int i = 0; i < realvec_t::size; ++i) {
+        bool rstd = n & (1 << i);
         bool rvml = x[i];
         check_bool("[]", rstd, rvml, x);
       }
-      
+
       {
         boolvec_t rstd;
-        for (int i=0; i<realvec_t::size; ++i) {
+        for (int i = 0; i < realvec_t::size; ++i) {
           rstd.set_elt(i, !x[i]);
         }
         boolvec_t rvml = !x;
@@ -842,7 +745,7 @@ struct vecmathlib_test {
       }
       {
         bool rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd &= x[i];
         }
         bool rvml = all(x);
@@ -850,39 +753,36 @@ struct vecmathlib_test {
       }
       {
         bool rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd |= x[i];
         }
         bool rvml = any(x);
         check_bool("any", rstd, rvml, x);
       }
-      check_bool
-        ("ifthen(bool)",
-         local_ifthen<bool>,
-         (boolvec_t(*)(boolvec_t,boolvec_t,boolvec_t))vecmathlib::ifthen,
-         x, BV(false), BV(true));
-      check_int("ifthen(int)",
-                local_ifthen<int_t>,
-                (intvec_t(*)(boolvec_t,intvec_t,intvec_t))vecmathlib::ifthen,
+      check_bool(
+          "ifthen(bool)", local_ifthen<bool>,
+          (boolvec_t (*)(boolvec_t, boolvec_t, boolvec_t))vecmathlib::ifthen, x,
+          BV(false), BV(true));
+      check_int("ifthen(int)", local_ifthen<int_t>,
+                (intvec_t (*)(boolvec_t, intvec_t, intvec_t))vecmathlib::ifthen,
                 x, IV(I(1)), IV(I(2)));
-      check_real("ifthen(real)",
-                 local_ifthen<real_t>,
-                 ((realvec_t(*)(boolvec_t,realvec_t,realvec_t))
-                  vecmathlib::ifthen),
-                x, RV(1.0), RV(2.0), R(0.0));
-    }
-    
-    for (int n=0; n<(1<<realvec_t::size); ++n) {
-      for (int m=0; m<(1<<realvec_t::size); ++m) {
+      check_real(
+          "ifthen(real)", local_ifthen<real_t>,
+          ((realvec_t (*)(boolvec_t, realvec_t, realvec_t))vecmathlib::ifthen),
+          x, RV(1.0), RV(2.0), R(0.0));
+    }
+
+    for (int n = 0; n < (1 << realvec_t::size); ++n) {
+      for (int m = 0; m < (1 << realvec_t::size); ++m) {
         boolvec_t x, y;
-        for (int i=0; i<realvec_t::size; ++i) {
-          x.set_elt(i, n & (1<<i));
-          y.set_elt(i, m & (1<<i));
+        for (int i = 0; i < realvec_t::size; ++i) {
+          x.set_elt(i, n & (1 << i));
+          y.set_elt(i, m & (1 << i));
         }
-        
+
         {
           boolvec_t rstd;
-          for (int i=0; i<realvec_t::size; ++i) {
+          for (int i = 0; i < realvec_t::size; ++i) {
             rstd.set_elt(i, x[i] && y[i]);
           }
           boolvec_t rvml = x && y;
@@ -890,7 +790,7 @@ struct vecmathlib_test {
         }
         {
           boolvec_t rstd;
-          for (int i=0; i<realvec_t::size; ++i) {
+          for (int i = 0; i < realvec_t::size; ++i) {
             rstd.set_elt(i, x[i] || y[i]);
           }
           boolvec_t rvml = x || y;
@@ -898,7 +798,7 @@ struct vecmathlib_test {
         }
         {
           boolvec_t rstd;
-          for (int i=0; i<realvec_t::size; ++i) {
+          for (int i = 0; i < realvec_t::size; ++i) {
             rstd.set_elt(i, x[i] == y[i]);
           }
           boolvec_t rvml = x == y;
@@ -906,7 +806,7 @@ struct vecmathlib_test {
         }
         {
           boolvec_t rstd;
-          for (int i=0; i<realvec_t::size; ++i) {
+          for (int i = 0; i < realvec_t::size; ++i) {
             rstd.set_elt(i, x[i] != y[i]);
           }
           boolvec_t rvml = x != y;
@@ -915,322 +815,374 @@ struct vecmathlib_test {
       }
     }
   }
-  
-  
-  
+
   static bool local_convert_bool(int_t x) { return x; }
   static int_t local_convert_int(bool x) { return x; }
-  template<typename T> static T local_pos(T x) { return +x; }
-  template<typename T> static T local_neg(T x) { return -x; }
-  template<typename T> static T local_not(T x) { return ~x; }
-  template<typename T> static T local_add(T x, T y) { return x+y; }
-  template<typename T> static T local_sub(T x, T y) { return x-y; }
-  template<typename T> static T local_mul(T x, T y) { return x*y; }
-  template<typename T> static T local_div(T x, T y) { return x/y; }
-  template<typename T> static T local_mod(T x, T y) { return x%y; }
-  template<typename T> static T local_and(T x, T y) { return x&y; }
-  template<typename T> static T local_or(T x, T y) { return x|y; }
-  template<typename T> static T local_xor(T x, T y) { return x^y; }
-  
-  static int_t local_lsr(int_t x, int_t y) { return uint_t(x)>>uint_t(y); }
-  template<typename T> static T local_srs(T x, typename T::scalar_t y)
-  {
-    return x>>y;
+  template <typename T> static T local_pos(T x) { return +x; }
+  template <typename T> static T local_neg(T x) { return -x; }
+  template <typename T> static T local_not(T x) { return ~x; }
+  template <typename T> static T local_add(T x, T y) { return x + y; }
+  template <typename T> static T local_sub(T x, T y) { return x - y; }
+  template <typename T> static T local_mul(T x, T y) { return x * y; }
+  template <typename T> static T local_div(T x, T y) { return x / y; }
+  template <typename T> static T local_mod(T x, T y) { return x % y; }
+  template <typename T> static T local_and(T x, T y) { return x & y; }
+  template <typename T> static T local_or(T x, T y) { return x | y; }
+  template <typename T> static T local_xor(T x, T y) { return x ^ y; }
+
+  static int_t local_lsr(int_t x, int_t y) { return uint_t(x) >> uint_t(y); }
+  template <typename T> static T local_srs(T x, typename T::scalar_t y) {
+    return x >> y;
   }
-  template<typename T> static T local_sls(T x, typename T::scalar_t y)
-  {
-    return x<<y;
+  template <typename T> static T local_sls(T x, typename T::scalar_t y) {
+    return x << y;
   }
-  template<typename T> static T local_sr(T x, T y) { return x>>y; }
-  template<typename T> static T local_sl(T x, T y) { return x<<y; }
-
-  template<typename T> static bool local_isignbit(T x) { return x<0; }
-  template<typename T> static bool local_eq(T x, T y) { return x==y; }
-  template<typename T> static bool local_ne(T x, T y) { return x!=y; }
-  template<typename T> static bool local_lt(T x, T y) { return x<y; }
-  template<typename T> static bool local_le(T x, T y) { return x<=y; }
-  template<typename T> static bool local_gt(T x, T y) { return x>y; }
-  template<typename T> static bool local_ge(T x, T y) { return x>=y; }
-  template<typename T> static boolvec_t local_veq(T x, T y) { return x==y; }
-  template<typename T> static boolvec_t local_vne(T x, T y) { return x!=y; }
-  template<typename T> static boolvec_t local_vlt(T x, T y) { return x<y; }
-  template<typename T> static boolvec_t local_vle(T x, T y) { return x<=y; }
-  template<typename T> static boolvec_t local_vgt(T x, T y) { return x>y; }
-  template<typename T> static boolvec_t local_vge(T x, T y) { return x>=y; }
-  static void test_int()
-  {
+  template <typename T> static T local_sr(T x, T y) { return x >> y; }
+  template <typename T> static T local_sl(T x, T y) { return x << y; }
+
+  template <typename T> static bool local_isignbit(T x) { return x < 0; }
+  template <typename T> static bool local_eq(T x, T y) { return x == y; }
+  template <typename T> static bool local_ne(T x, T y) { return x != y; }
+  template <typename T> static bool local_lt(T x, T y) { return x < y; }
+  template <typename T> static bool local_le(T x, T y) { return x <= y; }
+  template <typename T> static bool local_gt(T x, T y) { return x > y; }
+  template <typename T> static bool local_ge(T x, T y) { return x >= y; }
+  template <typename T> static boolvec_t local_veq(T x, T y) { return x == y; }
+  template <typename T> static boolvec_t local_vne(T x, T y) { return x != y; }
+  template <typename T> static boolvec_t local_vlt(T x, T y) { return x < y; }
+  template <typename T> static boolvec_t local_vle(T x, T y) { return x <= y; }
+  template <typename T> static boolvec_t local_vgt(T x, T y) { return x > y; }
+  template <typename T> static boolvec_t local_vge(T x, T y) { return x >= y; }
+  static void test_int() {
     cout << "   testing integer operations...\n" << flush;
-    
+
     intvec_t i0 = intvec_t(I(0));
     intvec_t i1 = intvec_t(I(1));
     intvec_t iiota = intvec_t::iota();
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       check_int("0", 0, i0[i]);
       check_int("1", 1, i1[i]);
       check_int("iota", i, iiota[i]);
     }
-    
+
     i0 = intvec_t(I(1));
     i1 = intvec_t(I(0));
-    for (int n=0; n<realvec_t::size; ++n) {
+    for (int n = 0; n < realvec_t::size; ++n) {
       i0.set_elt(n, 0);
       i1.set_elt(n, 1);
-      for (int i=0; i<realvec_t::size; ++i) {
-        check_bool("set_elt", i<=n ? 0 : 1, i0[i], 0);
-        check_bool("set_elt", i<=n ? 1 : 0, i1[i], 1);
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_bool("set_elt", i <= n ? 0 : 1, i0[i], 0);
+        check_bool("set_elt", i <= n ? 1 : 0, i1[i], 1);
       }
     }
-    
+
     const int_t int_min = std::numeric_limits<int_t>::min();
     const int_t int_max = std::numeric_limits<int_t>::max();
     const int_t values[] = {
-      0, 1, 2, 3, -1, -2, -3,
-      int_min, int_min+1, int_min+2, int_min+3,
-      int_max, int_max-1, int_max-2, int_max-3,
+        0,           1,       2,           3,           -1,
+        -2,          -3,      int_min,     int_min + 1, int_min + 2,
+        int_min + 3, int_max, int_max - 1, int_max - 2, int_max - 3,
     };
     const int nvalues = sizeof values / sizeof *values;
 
-    for (int i=0; i<nvalues*nvalues+2*imax; ++i) {
+    for (int i = 0; i < nvalues * nvalues + 2 * imax; ++i) {
       intvec_t x, y;
-      if (i<nvalues*nvalues) {
-        x = values[i%nvalues];
-        y = values[i/nvalues];
-      } else if (i<nvalues*nvalues+imax) {
+      if (i < nvalues * nvalues) {
+        x = values[i % nvalues];
+        y = values[i / nvalues];
+      } else if (i < nvalues * nvalues + imax) {
         x = random(I(-100), I(+100));
         y = random(I(-100), I(+100));
       } else {
-        x = random(int_min/2, int_max/2);
-        y = random(int_min/2, int_max/2);
+        x = random(int_min / 2, int_max / 2);
+        y = random(int_min / 2, int_max / 2);
       }
       boolvec_t b = convert_bool(random(I(0), I(1)));
-      
-      check_bool<IV>("convert_bool(int)",
-                     local_convert_bool, vecmathlib::convert_bool, x);
-      check_int<BV>("convert_int(bool)",
-                    local_convert_int, vecmathlib::convert_int, b);
-      
+
+      check_bool<IV>("convert_bool(int)", local_convert_bool,
+                     vecmathlib::convert_bool, x);
+      check_int<BV>("convert_int(bool)", local_convert_int,
+                    vecmathlib::convert_int, b);
+
       check_int<IV>("+", local_pos, local_pos, x);
       check_int<IV>("-", local_neg, local_neg, x);
       check_int<IV>("~", local_not, local_not, x);
-      
-      check_int<IV,IV>("+", local_add, local_add, x, y);
-      check_int<IV,IV>("-", local_sub, local_sub, x, y);
-      check_int<IV,IV>("&", local_and, local_and, x, y);
-      check_int<IV,IV>("|", local_or, local_or, x, y);
-      check_int<IV,IV>("^", local_xor, local_xor, x, y);
-      
-      const int_t bits = 8*sizeof(int_t);
-      check_int<IV,I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits-1));
-      check_int<IV,I>(">>", local_sr, local_srs, x, y[0] & (bits-1));
-      check_int<IV,I>("<<", local_sl, local_sls, x, y[0] & (bits-1));
-      check_int<IV,IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits-1));
-      check_int<IV,IV>(">>", local_sr, local_sr, x, y & IV(bits-1));
-      check_int<IV,IV>("<<", local_sl, local_sl, x, y & IV(bits-1));
-      
+
+      check_int<IV, IV>("+", local_add, local_add, x, y);
+      check_int<IV, IV>("-", local_sub, local_sub, x, y);
+      check_int<IV, IV>("&", local_and, local_and, x, y);
+      check_int<IV, IV>("|", local_or, local_or, x, y);
+      check_int<IV, IV>("^", local_xor, local_xor, x, y);
+
+      const int_t bits = 8 * sizeof(int_t);
+      check_int<IV, I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits - 1));
+      check_int<IV, I>(">>", local_sr, local_srs, x, y[0] & (bits - 1));
+      check_int<IV, I>("<<", local_sl, local_sls, x, y[0] & (bits - 1));
+      check_int<IV, IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits - 1));
+      check_int<IV, IV>(">>", local_sr, local_sr, x, y & IV(bits - 1));
+      check_int<IV, IV>("<<", local_sl, local_sl, x, y & IV(bits - 1));
+
       check_bool<IV>("isignbit", local_isignbit, vecmathlib::isignbit, x);
-      check_bool<IV,IV>("==", local_eq, local_veq, x, y);
-      check_bool<IV,IV>("!=", local_ne, local_vne, x, y);
-      check_bool<IV,IV>("<", local_lt, local_vlt, x, y);
-      check_bool<IV,IV>("<=", local_le, local_vle, x, y);
-      check_bool<IV,IV>(">", local_gt, local_vgt, x, y);
-      check_bool<IV,IV>(">=", local_ge, local_vge, x, y);
+      check_bool<IV, IV>("==", local_eq, local_veq, x, y);
+      check_bool<IV, IV>("!=", local_ne, local_vne, x, y);
+      check_bool<IV, IV>("<", local_lt, local_vlt, x, y);
+      check_bool<IV, IV>("<=", local_le, local_vle, x, y);
+      check_bool<IV, IV>(">", local_gt, local_vgt, x, y);
+      check_bool<IV, IV>(">=", local_ge, local_vge, x, y);
     }
   }
-  
-  static void test_real()
-  {
+
+  static void test_real() {
     cout << "   testing real operations...\n" << flush;
-    
+
     realvec_t r0 = realvec_t(0.0);
     realvec_t r1 = realvec_t(1.0);
-    for (int i=0; i<realvec_t::size; ++i) {
+    for (int i = 0; i < realvec_t::size; ++i) {
       check_real("0.0", R(0.0), r0[i]);
       check_real("1.0", R(1.0), r1[i]);
     }
-    
+
     r0 = realvec_t(1.0);
     r1 = realvec_t(0.0);
-    for (int n=0; n<realvec_t::size; ++n) {
+    for (int n = 0; n < realvec_t::size; ++n) {
       r0.set_elt(n, R(0.0));
       r1.set_elt(n, R(1.0));
-      for (int i=0; i<realvec_t::size; ++i) {
-        check_bool("set_elt", i<=n ? R(0.0) : R(1.0), r0[i], R(0.0));
-        check_bool("set_elt", i<=n ? R(1.0) : R(0.0), r1[i], R(1.0));
+      for (int i = 0; i < realvec_t::size; ++i) {
+        check_bool("set_elt", i <= n ? R(0.0) : R(1.0), r0[i], R(0.0));
+        check_bool("set_elt", i <= n ? R(1.0) : R(0.0), r1[i], R(1.0));
       }
     }
-    
+
     // barrier
     realvec_t rcancel = r1;
     rcancel += RV(R(FP::max() / 2));
     rcancel.barrier();
     rcancel -= RV(R(FP::max() / 2));
     check_real("barrier", R(0.0), rcancel[0]);
-    
+
     // rounding (break ties to even, or break ties away from zero?)
     realvec_t rbase = RV(R(1.0));
-    rbase += RV(FP::epsilon()/2);
+    rbase += RV(FP::epsilon() / 2);
     check_real("flt_rounds", R(1.0), rbase[0]);
     rbase = RV(R(1.0) + FP::epsilon());
-    rbase += RV(FP::epsilon()/2);
-    check_real("flt_rounds", R(1.0) + 2*FP::epsilon(), rbase[0]);
+    rbase += RV(FP::epsilon() / 2);
+    check_real("flt_rounds", R(1.0) + 2 * FP::epsilon(), rbase[0]);
   }
-  
-  static int_t local_bitifthen(int_t x, int_t y, int_t z)
-  {
+
+  static int_t local_bitifthen(int_t x, int_t y, int_t z) {
     return (x & y) | (~x & z);
   }
-  static int_t local_clz(int_t x)
-  {
+  static int_t local_clz(int_t x) {
     int bits = CHAR_BIT * sizeof(x);
     int res = 0;
-    for (; res<bits; ++res) {
-      if (x & (I(1) << (bits-res-1))) break;
+    for (; res < bits; ++res) {
+      if (x & (I(1) << (bits - res - 1)))
+        break;
     }
     return res;
   }
-  static int_t local_max(int_t x, int_t y)
-  {
-    return std::max(x, y);
-  }
-  static int_t local_min(int_t x, int_t y)
-  {
-    return std::min(x, y);
-  }
-  static int_t local_popcount(int_t x)
-  {
+  static int_t local_max(int_t x, int_t y) { return std::max(x, y); }
+  static int_t local_min(int_t x, int_t y) { return std::min(x, y); }
+  static int_t local_popcount(int_t x) {
     int bits = CHAR_BIT * sizeof(x);
     int res = 0;
-    for (int d=0; d<bits; ++d) {
-      if (x & (I(1) << d)) ++res;
+    for (int d = 0; d < bits; ++d) {
+      if (x & (I(1) << d))
+        ++res;
     }
     return res;
   }
-  static int_t local_rotate(int_t x, int_t n)
-  {
+  static int_t local_rotate(int_t x, int_t n) {
     int_t mask = CHAR_BIT * sizeof(int_t) - 1;
     int_t left = x << (n & mask);
     int_t right = I(U(x) >> U(-n & mask));
     return left | right;
   }
-  static void test_abs()
-  {
-    cout << "   testing abs bitifthen clz isignbit max min popcount rotate...\n" << flush;
-        
-    for (int i=0; i<imax; ++i) {
+  static void test_abs() {
+    cout << "   testing abs bitifthen clz isignbit max min popcount rotate...\n"
+         << flush;
+
+    for (int i = 0; i < imax; ++i) {
       const intvec_t x = random(I(-1000000), I(+1000000));
       const intvec_t y = random(I(-1000000), I(+1000000));
       const intvec_t z = random(I(-1000000), I(+1000000));
-      
+
       check_int<IV>("abs", std::abs, vecmathlib::abs, x);
-      check_int<IV,IV,IV>("bitifthen",
-                          local_bitifthen, vecmathlib::bitifthen, x, y, z);
+      check_int<IV, IV, IV>("bitifthen", local_bitifthen, vecmathlib::bitifthen,
+                            x, y, z);
       check_int<IV>("clz", local_clz, vecmathlib::clz, x);
-      check_int<IV,IV>("max", local_max, vecmathlib::max, x, y);
-      check_int<IV,IV>("min", local_min, vecmathlib::min, x, y);
+      check_int<IV, IV>("max", local_max, vecmathlib::max, x, y);
+      check_int<IV, IV>("min", local_min, vecmathlib::min, x, y);
       check_int<IV>("popcount", local_popcount, vecmathlib::popcount, x);
-      check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]);
-      check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y);
+      check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]);
+      check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y);
     }
   }
-  
+
   // Change signature: "int" -> "int_t"
-  static real_t local_frexp0(real_t x)
-  {
+  static real_t local_frexp0(real_t x) {
     int r;
     return vml_std::frexp(x, &r);
   }
-  static int_t local_frexp1(real_t x)
-  {
-    if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max();
-    if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min();
+  static int_t local_frexp1(real_t x) {
+    if (vml_std::isinf(x))
+      return std::numeric_limits<int_t>::max();
+    if (vml_std::isnan(x))
+      return std::numeric_limits<int_t>::min();
     int r;
     vml_std::frexp(x, &r);
     return r;
   }
-  static realvec_t local_vfrexp0(realvec_t x)
-  {
+  static realvec_t local_vfrexp0(realvec_t x) {
     intvec_t r;
     return vecmathlib::frexp(x, &r);
   }
-  static intvec_t local_vfrexp1(realvec_t x)
-  {
+  static intvec_t local_vfrexp1(realvec_t x) {
     intvec_t r;
     vecmathlib::frexp(x, &r);
     return r;
   }
-  static int_t local_ilogb(real_t x)
-  {
-    if (x==R(0.0)) return std::numeric_limits<int_t>::min();
-    if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max();
-    if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min();
+  static int_t local_ilogb(real_t x) {
+    if (x == R(0.0))
+      return std::numeric_limits<int_t>::min();
+    if (vml_std::isinf(x))
+      return std::numeric_limits<int_t>::max();
+    if (vml_std::isnan(x))
+      return std::numeric_limits<int_t>::min();
     return vml_std::ilogb(x);
   }
   static real_t local_ldexp(real_t x, int_t n) { return ldexp(x, n); }
-  static real_t local_mad(real_t x, real_t y, real_t z) { return x*y+z; }
-  static void test_fabs()
-  {
-    cout << "   testing + - + - * == != < <= > >= copysign fabs fdim fma fmax fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad nextafter signbit...\n" << flush;
-    
+  static real_t local_mad(real_t x, real_t y, real_t z) { return x * y + z; }
+  static void test_fabs() {
+    cout << "   testing + - + - * == != < <= > >= copysign fabs fdim fma fmax "
+            "fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad "
+            "nextafter signbit...\n"
+         << flush;
+
     const real_t eps = FP::epsilon();
     const real_t int_min = R(std::numeric_limits<int_t>::min());
     const real_t int_max = R(std::numeric_limits<int_t>::max());
     const real_t uint_min = R(std::numeric_limits<uint_t>::min());
     const real_t uint_max = R(std::numeric_limits<uint_t>::max());
     const real_t values[] = {
-      R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1),
-      R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1),
-      R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps,
-      R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps,
-      R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps,
-      R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps,
+        R(+0.0),
+        R(+0.1),
+        R(+0.9),
+        R(+1.0),
+        R(+1.1),
+        R(-0.0),
+        R(-0.1),
+        R(-0.9),
+        R(-1.0),
+        R(-1.1),
+        R(+0.0) + eps,
+        R(+0.1) + eps,
+        R(+0.9) + eps,
+        R(+1.0) + eps,
+        R(+1.1) + eps,
+        R(-0.0) + eps,
+        R(-0.1) + eps,
+        R(-0.9) + eps,
+        R(-1.0) + eps,
+        R(-1.1) + eps,
+        R(+0.0) - eps,
+        R(+0.1) - eps,
+        R(+0.9) - eps,
+        R(+1.0) - eps,
+        R(+1.1) - eps,
+        R(-0.0) - eps,
+        R(-0.1) - eps,
+        R(-0.9) - eps,
+        R(-1.0) - eps,
+        R(-1.1) - eps,
 #ifdef VML_HAVE_DENORMALS
-      +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0),
-      -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0),
+        +FP::min(),
+        +FP::min() * (R(1.0) + eps),
+        +FP::min() * R(2.0),
+        -FP::min(),
+        -FP::min() * (R(1.0) + eps),
+        -FP::min() * R(2.0),
 #endif
-      +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps),
-      -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps),
-      +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps),
-      -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps),
+        +FP::max(),
+        +FP::max() * (R(1.0) - eps),
+        +FP::max() * (R(1.0) - R(2.0) * eps),
+        -FP::max(),
+        -FP::max() * (R(1.0) - eps),
+        -FP::max() * (R(1.0) - R(2.0) * eps),
+        +R(0.5) * FP::max(),
+        +R(0.5) * FP::max() * (R(1.0) + eps),
+        -R(0.5) * FP::max(),
+        -R(0.5) * FP::max() * (R(1.0) + eps),
 #ifdef VML_HAVE_INF
-      +R(1.0/0.0),              // +FP::infinity()
-      -R(1.0/0.0),              // -FP::infinity()
+        +R(1.0 / 0.0), // +FP::infinity()
+        -R(1.0 / 0.0), // -FP::infinity()
 #endif
 #ifdef VML_HAVE_NAN
-      R(0.0/0.0),               // FP::quiet_NaN()
+        R(0.0 / 0.0), // FP::quiet_NaN()
 #endif
-      +int_min, +int_max, +uint_min, +uint_max,
-      -int_min, -int_max, -uint_min, -uint_max,
-      +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1),
-      -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1),
-      +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1),
-      -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1),
-      +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0),
-      -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0),
-      +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0),
-      -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0),
-      -R(443.9999425),
+        +int_min,
+        +int_max,
+        +uint_min,
+        +uint_max,
+        -int_min,
+        -int_max,
+        -uint_min,
+        -uint_max,
+        +int_min + R(0.1),
+        +int_max + R(0.1),
+        +uint_min + R(0.1),
+        +uint_max + R(0.1),
+        -int_min + R(0.1),
+        -int_max + R(0.1),
+        -uint_min + R(0.1),
+        -uint_max + R(0.1),
+        +int_min - R(0.1),
+        +int_max - R(0.1),
+        +uint_min - R(0.1),
+        +uint_max - R(0.1),
+        -int_min - R(0.1),
+        -int_max - R(0.1),
+        -uint_min - R(0.1),
+        -uint_max - R(0.1),
+        +int_min + R(1.0),
+        +int_max + R(1.0),
+        +uint_min + R(1.0),
+        +uint_max + R(1.0),
+        -int_min + R(1.0),
+        -int_max + R(1.0),
+        -uint_min + R(1.0),
+        -uint_max + R(1.0),
+        +int_min - R(1.0),
+        +int_max - R(1.0),
+        +uint_min - R(1.0),
+        +uint_max - R(1.0),
+        -int_min - R(1.0),
+        -int_max - R(1.0),
+        -uint_min - R(1.0),
+        -uint_max - R(1.0),
+        -R(443.9999425),
     };
     const int nvalues = sizeof values / sizeof *values;
-    
-    for (int i=0; i<8*nvalues+imax; ++i) {
-      const realvec_t x =
-        i<8*nvalues && i&1 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
-      const realvec_t y =
-        i<8*nvalues && i&2 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
-      const realvec_t z =
-        i<8*nvalues && i&4 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
+
+    for (int i = 0; i < 8 * nvalues + imax; ++i) {
+      const realvec_t x = i < 8 * nvalues && i & 1 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
+      const realvec_t y = i < 8 * nvalues && i & 2 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
+      const realvec_t z = i < 8 * nvalues && i & 4 ? RV(values[i / 8])
+                                                   : random(R(-10.0), R(+10.0));
       const intvec_t n = random(int_t(-10), int_t(+10));
-      
+
       check_real<RV>("+", local_pos, local_pos, x, R(0.0));
       check_real<RV>("-", local_neg, local_neg, x, R(0.0));
-      
-      check_real<RV,RV>("+", local_add, local_add, x, y, R(0.0));
-      check_real<RV,RV>("-", local_sub, local_sub, x, y, R(0.0));
-      check_real<RV,RV>("*", local_mul, local_mul, x, y, R(0.0));
-      
+
+      check_real<RV, RV>("+", local_add, local_add, x, y, R(0.0));
+      check_real<RV, RV>("-", local_sub, local_sub, x, y, R(0.0));
+      check_real<RV, RV>("*", local_mul, local_mul, x, y, R(0.0));
+
       {
         real_t rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd += x[i];
         }
         real_t rvml = sum(x);
@@ -1238,7 +1190,7 @@ struct vecmathlib_test {
       }
       {
         real_t rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd *= x[i];
         }
         real_t rvml = prod(x);
@@ -1246,7 +1198,7 @@ struct vecmathlib_test {
       }
       {
         real_t rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd = vml_std::fmax(rstd, x[i]);
         }
         real_t rvml = vecmathlib::maxval(x);
@@ -1254,34 +1206,33 @@ struct vecmathlib_test {
       }
       {
         real_t rstd = x[0];
-        for (int i=1; i<realvec_t::size; ++i) {
+        for (int i = 1; i < realvec_t::size; ++i) {
           rstd = vml_std::fmin(rstd, x[i]);
         }
         real_t rvml = vecmathlib::minval(x);
         check_real("minval", rstd, rvml, x, R(0.0));
       }
-      
-      check_bool<RV,RV>("==", local_eq, local_veq, x, y);
-      check_bool<RV,RV>("!=", local_ne, local_vne, x, y);
-      check_bool<RV,RV>("<", local_lt, local_vlt, x, y);
-      check_bool<RV,RV>("<=", local_le, local_vle, x, y);
-      check_bool<RV,RV>(">", local_gt, local_vgt, x, y);
-      check_bool<RV,RV>(">=", local_ge, local_vge, x, y);
-      
-      check_real<RV,RV>("copysign",
-                        vml_std::copysign, vecmathlib::copysign, x, y, 0.0);
+
+      check_bool<RV, RV>("==", local_eq, local_veq, x, y);
+      check_bool<RV, RV>("!=", local_ne, local_vne, x, y);
+      check_bool<RV, RV>("<", local_lt, local_vlt, x, y);
+      check_bool<RV, RV>("<=", local_le, local_vle, x, y);
+      check_bool<RV, RV>(">", local_gt, local_vgt, x, y);
+      check_bool<RV, RV>(">=", local_ge, local_vge, x, y);
+
+      check_real<RV, RV>("copysign", vml_std::copysign, vecmathlib::copysign, x,
+                         y, 0.0);
       check_real<RV>("fabs", vml_std::fabs, vecmathlib::fabs, x, 0.0);
-      check_real<RV,RV>("fdim",
-                        vml_std::fdim, vecmathlib::fdim, x, y, accuracy());
-      check_real<RV,RV,RV>("fma",
-                           vml_std::fma, vecmathlib::fma,
-                           x, y, z, R(10.0)*accuracy());
-      check_real<RV,RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0);
-      check_real<RV,RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0);
+      check_real<RV, RV>("fdim", vml_std::fdim, vecmathlib::fdim, x, y,
+                         accuracy());
+      check_real<RV, RV, RV>("fma", vml_std::fma, vecmathlib::fma, x, y, z,
+                             R(10.0) * accuracy());
+      check_real<RV, RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0);
+      check_real<RV, RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0);
       check_real<RV>("frexp0", local_frexp0, local_vfrexp0, x, 0.0);
       check_int<RV>("frexp1", local_frexp1, local_vfrexp1, x);
-      check_int<RV>("ilogb",
-                    local_ilogb, (intvec_t(*)(realvec_t))vecmathlib::ilogb, x);
+      check_int<RV>("ilogb", local_ilogb,
+                    (intvec_t (*)(realvec_t))vecmathlib::ilogb, x);
 #if defined VML_HAVE_INF || defined VML_HAVE_NAN
       check_bool<RV>("isfinite", vml_std::isfinite, vecmathlib::isfinite, x);
 #endif
@@ -1294,91 +1245,162 @@ struct vecmathlib_test {
 #ifdef VML_HAVE_DENORMALS
       check_bool<RV>("isnormal", vml_std::isnormal, vecmathlib::isnormal, x);
 #endif
-      check_real<RV,I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0);
-      check_real<RV,IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0);
-      check_real<RV,RV,RV>("mad",
-                           local_mad, vecmathlib::mad,
-                           x, y, z, R(10.0)*accuracy());
-      check_real<RV,RV>("nextafter",
-                        vml_std::nextafter, vecmathlib::nextafter, x, y, 0.0);
+      check_real<RV, I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0);
+      check_real<RV, IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0);
+      check_real<RV, RV, RV>("mad", local_mad, vecmathlib::mad, x, y, z,
+                             R(10.0) * accuracy());
+      check_real<RV, RV>("nextafter", vml_std::nextafter, vecmathlib::nextafter,
+                         x, y, 0.0);
       check_bool<RV>("signbit", vml_std::signbit, vecmathlib::signbit, x);
     }
   }
-  
-  static void test_convert()
-  {
-    cout << "   testing ceil convert_float convert_int floor rint round trunc...\n"
+
+  static void test_convert() {
+    cout << "   testing ceil convert_float convert_int floor rint round "
+            "trunc...\n"
          << flush;
-    
+
     const real_t eps = FP::epsilon();
     const real_t int_min = R(std::numeric_limits<int_t>::min());
     const real_t int_max = R(std::numeric_limits<int_t>::max());
     const real_t uint_min = R(std::numeric_limits<uint_t>::min());
     const real_t uint_max = R(std::numeric_limits<uint_t>::max());
-    const real_t mantissa_max = (U(1) << (FP::mantissa_bits+1)) - U(1);
-    const real_t real_max =
-      (((U(1) << (FP::mantissa_bits+1)) - U(1)) << (FP::exponent_bits-1)) +
-      (U(1) << (FP::exponent_bits-1)) - U(1);
+    const real_t mantissa_max = (U(1) << (FP::mantissa_bits + 1)) - U(1);
+    const real_t real_max = (((U(1) << (FP::mantissa_bits + 1)) - U(1))
+                             << (FP::exponent_bits - 1)) +
+                            (U(1) << (FP::exponent_bits - 1)) - U(1);
     const real_t values[] = {
-      R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1),
-      R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1),
-      R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps,
-      R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps,
-      R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps,
-      R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps,
+        R(+0.0),
+        R(+0.1),
+        R(+0.9),
+        R(+1.0),
+        R(+1.1),
+        R(-0.0),
+        R(-0.1),
+        R(-0.9),
+        R(-1.0),
+        R(-1.1),
+        R(+0.0) + eps,
+        R(+0.1) + eps,
+        R(+0.9) + eps,
+        R(+1.0) + eps,
+        R(+1.1) + eps,
+        R(-0.0) + eps,
+        R(-0.1) + eps,
+        R(-0.9) + eps,
+        R(-1.0) + eps,
+        R(-1.1) + eps,
+        R(+0.0) - eps,
+        R(+0.1) - eps,
+        R(+0.9) - eps,
+        R(+1.0) - eps,
+        R(+1.1) - eps,
+        R(-0.0) - eps,
+        R(-0.1) - eps,
+        R(-0.9) - eps,
+        R(-1.0) - eps,
+        R(-1.1) - eps,
 #ifdef VML_HAVE_DENORMALS
-      +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0),
-      -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0),
+        +FP::min(),
+        +FP::min() * (R(1.0) + eps),
+        +FP::min() * R(2.0),
+        -FP::min(),
+        -FP::min() * (R(1.0) + eps),
+        -FP::min() * R(2.0),
 #endif
-      +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps),
-      -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps),
-      +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps),
-      -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps),
+        +FP::max(),
+        +FP::max() * (R(1.0) - eps),
+        +FP::max() * (R(1.0) - R(2.0) * eps),
+        -FP::max(),
+        -FP::max() * (R(1.0) - eps),
+        -FP::max() * (R(1.0) - R(2.0) * eps),
+        +R(0.5) * FP::max(),
+        +R(0.5) * FP::max() * (R(1.0) + eps),
+        -R(0.5) * FP::max(),
+        -R(0.5) * FP::max() * (R(1.0) + eps),
 #ifdef VML_HAVE_INF
-      +R(1.0/0.0),              // +FP::infinity()
-      -R(1.0/0.0),              // -FP::infinity()
+        +R(1.0 / 0.0), // +FP::infinity()
+        -R(1.0 / 0.0), // -FP::infinity()
 #endif
 #ifdef VML_HAVE_NAN
-      R(0.0/0.0),               // FP::quiet_NaN()
+        R(0.0 / 0.0), // FP::quiet_NaN()
 #endif
-      +int_min, +int_max, +uint_min, +uint_max,
-      -int_min, -int_max, -uint_min, -uint_max,
-      +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1),
-      -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1),
-      +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1),
-      -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1),
-      +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0),
-      -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0),
-      +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0),
-      -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0),
-      +mantissa_max, +mantissa_max-R(1.0), +mantissa_max+R(1.0),
-      -mantissa_max, -mantissa_max-R(1.0), -mantissa_max+R(1.0),
-      +real_max, +real_max-R(1.0), +real_max+R(1.0),
-      -real_max, -real_max-R(1.0), -real_max+R(1.0),
-      -R(443.9999425),
+        +int_min,
+        +int_max,
+        +uint_min,
+        +uint_max,
+        -int_min,
+        -int_max,
+        -uint_min,
+        -uint_max,
+        +int_min + R(0.1),
+        +int_max + R(0.1),
+        +uint_min + R(0.1),
+        +uint_max + R(0.1),
+        -int_min + R(0.1),
+        -int_max + R(0.1),
+        -uint_min + R(0.1),
+        -uint_max + R(0.1),
+        +int_min - R(0.1),
+        +int_max - R(0.1),
+        +uint_min - R(0.1),
+        +uint_max - R(0.1),
+        -int_min - R(0.1),
+        -int_max - R(0.1),
+        -uint_min - R(0.1),
+        -uint_max - R(0.1),
+        +int_min + R(1.0),
+        +int_max + R(1.0),
+        +uint_min + R(1.0),
+        +uint_max + R(1.0),
+        -int_min + R(1.0),
+        -int_max + R(1.0),
+        -uint_min + R(1.0),
+        -uint_max + R(1.0),
+        +int_min - R(1.0),
+        +int_max - R(1.0),
+        +uint_min - R(1.0),
+        +uint_max - R(1.0),
+        -int_min - R(1.0),
+        -int_max - R(1.0),
+        -uint_min - R(1.0),
+        -uint_max - R(1.0),
+        +mantissa_max,
+        +mantissa_max - R(1.0),
+        +mantissa_max + R(1.0),
+        -mantissa_max,
+        -mantissa_max - R(1.0),
+        -mantissa_max + R(1.0),
+        +real_max,
+        +real_max - R(1.0),
+        +real_max + R(1.0),
+        -real_max,
+        -real_max - R(1.0),
+        -real_max + R(1.0),
+        -R(443.9999425),
     };
     const int nvalues = sizeof values / sizeof *values;
-    
-    for (int i=0; i<nvalues+imax; ++i) {
+
+    for (int i = 0; i < nvalues + imax; ++i) {
       const realvec_t x =
-        i<nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10));
+          i < nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10));
       const intvec_t n1 = random(int_t(-100), int_t(+100));
-      //const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000));
+      // const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000));
       const intvec_t n2 =
-        random(std::numeric_limits<int_t>::min() / 2, // avoid overflow
-               std::numeric_limits<int_t>::max() / 2);
+          random(std::numeric_limits<int_t>::min() / 2, // avoid overflow
+                 std::numeric_limits<int_t>::max() / 2);
       const realvec_t fn1 = vecmathlib::convert_float(n1);
       const realvec_t fn2 = vecmathlib::convert_float(n2);
       const realvec_t fn1h = vecmathlib::convert_float(n1) * RV(0.25);
       const realvec_t fn2h = vecmathlib::convert_float(n2) * RV(0.25);
-      check_real<IV>("convert_float",
-                     FP::convert_float, vecmathlib::convert_float, n1, R(0.0));
-      check_real<IV>("convert_float",
-                     FP::convert_float, vecmathlib::convert_float, n2, R(0.0));
+      check_real<IV>("convert_float", FP::convert_float,
+                     vecmathlib::convert_float, n1, R(0.0));
+      check_real<IV>("convert_float", FP::convert_float,
+                     vecmathlib::convert_float, n2, R(0.0));
       // Note: RV(int_max) > int_max due to rounding
       if (all(x >= RV(int_min) && x < RV(int_max))) {
-        check_int<RV>("convert_int",
-                      FP::convert_int, vecmathlib::convert_int, x);
+        check_int<RV>("convert_int", FP::convert_int, vecmathlib::convert_int,
+                      x);
       }
       // TODO: These should all have accuracy R(0.0) instead!
       check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, x, accuracy());
@@ -1387,218 +1409,213 @@ struct vecmathlib_test {
       check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn1h, accuracy());
       check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn2h, accuracy());
       check_real<RV>("floor", vml_std::floor, vecmathlib::floor, x, accuracy());
-      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1, accuracy());
-      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2, accuracy());
-      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h, accuracy());
-      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h, accuracy());
-      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x, accuracy());
-      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1, accuracy());
-      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2, accuracy());
-      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h, accuracy());
-      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h, accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h,
+                     accuracy());
+      check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h,
+                     accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h,
+      // accuracy());
+      // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h,
+      // accuracy());
       check_real<RV>("rint", vml_std::rint, vecmathlib::rint, x, accuracy());
       check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1, accuracy());
       check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2, accuracy());
       check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1h, accuracy());
       check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2h, accuracy());
       check_real<RV>("round", vml_std::round, vecmathlib::round, x, accuracy());
-      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1, accuracy());
-      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2, accuracy());
-      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h, accuracy());
-      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h, accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h,
+                     accuracy());
+      check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h,
+                     accuracy());
       check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, x, accuracy());
-      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1, accuracy());
-      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2, accuracy());
-      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h, accuracy());
-      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h, accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h,
+                     accuracy());
+      check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h,
+                     accuracy());
     }
   }
-  
-  
-  
-  static void test_asin()
-  {
+
+  static void test_asin() {
     cout << "   testing asin acos atan atan2...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-1.0), R(+1.0));
       check_real<RV>("asin", vml_std::asin, vecmathlib::asin, x, accuracy(4));
       check_real<RV>("acos", vml_std::acos, vecmathlib::acos, x, accuracy(4));
     }
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-100.0), R(+100.0));
       const realvec_t y = random(R(-100.0), R(+100.0));
       check_real<RV>("atan", vml_std::atan, vecmathlib::atan, x, accuracy(5));
-      check_real<RV,RV>("atan2",
-                        vml_std::atan2, vecmathlib::atan2, x, y, accuracy(6));
+      check_real<RV, RV>("atan2", vml_std::atan2, vecmathlib::atan2, x, y,
+                         accuracy(6));
     }
   }
-  
-  static void test_asinh()
-  {
+
+  static void test_asinh() {
     cout << "   testing asinh acosh atanh...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-1000.0), R(+1000.0));
-      check_real<RV>("asinh",
-                     vml_std::asinh, vecmathlib::asinh, x, accuracy(4));
+      check_real<RV>("asinh", vml_std::asinh, vecmathlib::asinh, x,
+                     accuracy(4));
     }
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(1.0), R(1000.0));
-      check_real<RV>("acosh",
-                     vml_std::acosh, vecmathlib::acosh, x, accuracy(4));
+      check_real<RV>("acosh", vml_std::acosh, vecmathlib::acosh, x,
+                     accuracy(4));
     }
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-1.0), R(+1.0));
-      check_real<RV>("atanh",
-                     vml_std::atanh, vecmathlib::atanh, x, accuracy(5));
+      check_real<RV>("atanh", vml_std::atanh, vecmathlib::atanh, x,
+                     accuracy(5));
     }
   }
-  
+
   static real_t local_exp10(real_t x) { return pow(R(10.0), x); }
-  static void test_exp()
-  {
+  static void test_exp() {
     cout << "   testing exp exp10 exp2 expm1...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-100.0), R(+100.0));
       check_real<RV>("exp", vml_std::exp, vecmathlib::exp, x, accuracy(3));
       check_real<RV>("exp10", local_exp10, vecmathlib::exp10, x, accuracy(3));
       check_real<RV>("exp2", vml_std::exp2, vecmathlib::exp2, x, accuracy(3));
-      check_real<RV>("expm1",
-                     vml_std::expm1, vecmathlib::expm1, x, accuracy(3));
+      check_real<RV>("expm1", vml_std::expm1, vecmathlib::expm1, x,
+                     accuracy(3));
     }
   }
-  
-  static void test_log()
-  {
+
+  static void test_log() {
     cout << "   testing log log10 log1p log2...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(1.0e-10), R(1.0e+10));
       check_real<RV>("log", vml_std::log, vecmathlib::log, x, accuracy(3));
-      check_real<RV>("log10",
-                     vml_std::log10, vecmathlib::log10, x, accuracy(3));
-      check_real<RV>("log1p",
-                     vml_std::log1p, vecmathlib::log1p, x, accuracy(2));
+      check_real<RV>("log10", vml_std::log10, vecmathlib::log10, x,
+                     accuracy(3));
+      check_real<RV>("log1p", vml_std::log1p, vecmathlib::log1p, x,
+                     accuracy(2));
       check_real<RV>("log2", vml_std::log2, vecmathlib::log2, x, accuracy(3));
     }
   }
-  
-  static void test_pow()
-  {
+
+  static void test_pow() {
     cout << "   testing pow...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(0.001), R(1000.0));
       const realvec_t y = random(R(-10.0), R(+10.0));
       const realvec_t ya = fabs(y);
       const intvec_t n = random(I(-10), I(+10));
       const realvec_t fn = vecmathlib::convert_float(n);
-      check_real<RV,RV>("pow(0,y)",
-                        vml_std::pow, vecmathlib::pow, RV(0.0), ya,
-                        accuracy(16));
-      check_real<RV,RV>("pow(x,0)",
-                        vml_std::pow, vecmathlib::pow, x, RV(0.0),
-                        accuracy(16));
+      check_real<RV, RV>("pow(0,y)", vml_std::pow, vecmathlib::pow, RV(0.0), ya,
+                         accuracy(16));
+      check_real<RV, RV>("pow(x,0)", vml_std::pow, vecmathlib::pow, x, RV(0.0),
+                         accuracy(16));
       // just to check
       check_real<RV>("log(x)", vml_std::log, vecmathlib::log, x, accuracy(3));
-      check_real<RV,RV>("pow(x,y)",
-                        vml_std::pow, vecmathlib::pow, x, y, accuracy(16));
-      check_real<RV,RV>("pow(-x,n)",
-                        vml_std::pow, vecmathlib::pow, -x, fn, accuracy(16));
+      check_real<RV, RV>("pow(x,y)", vml_std::pow, vecmathlib::pow, x, y,
+                         accuracy(16));
+      check_real<RV, RV>("pow(-x,n)", vml_std::pow, vecmathlib::pow, -x, fn,
+                         accuracy(16));
     }
   }
-  
-  static real_t local_rcp(real_t x) { return R(1.0)/x; }
-  static void test_rcp()
-  {
+
+  static real_t local_rcp(real_t x) { return R(1.0) / x; }
+  static void test_rcp() {
     cout << "   testing / fmod rcp remainder...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-100.0), R(+100.0));
       const realvec_t y = random(R(-100.0), R(+100.0));
       const intvec_t n = random(I(-100), I(+100));
       const intvec_t m = random(I(-100), I(+100));
       const realvec_t fn = vecmathlib::convert_float(n);
-      const realvec_t fm = vecmathlib::convert_float
-        (m + vecmathlib::convert_int(m == intvec_t(I(0))));
-      check_real<RV,RV>("/", local_div, local_div, x, y, accuracy());
+      const realvec_t fm = vecmathlib::convert_float(
+          m + vecmathlib::convert_int(m == intvec_t(I(0))));
+      check_real<RV, RV>("/", local_div, local_div, x, y, accuracy());
       check_real<RV>("rcp", local_rcp, vecmathlib::rcp, x, accuracy());
-      check_real<RV,RV>("fmod(x,y)",
-                        vml_std::fmod, vecmathlib::fmod, x, y,
-                        2.0*accuracy(), y);
-      check_real<RV,RV>("fmod(x,m)",
-                        vml_std::fmod, vecmathlib::fmod, x, fm,
-                        2.0*accuracy(), fm);
-      check_real<RV,RV>("fmod(n,y)",
-                        vml_std::fmod, vecmathlib::fmod, fn, y,
-                        2.0*accuracy(), y);
-      check_real<RV,RV>("remainder(x,y)",
-                        vml_std::remainder, vecmathlib::remainder,
-                        x, y, R(2.0)*accuracy(), y);
-      check_real<RV,RV>("remainder(x,m)",
-                        vml_std::remainder, vecmathlib::remainder,
-                        x, fm, R(2.0)*accuracy(), fm);
-      check_real<RV,RV>("remainder(n,y)",
-                        vml_std::remainder, vecmathlib::remainder,
-                        fn, y, R(2.0)*accuracy(), y);
+      check_real<RV, RV>("fmod(x,y)", vml_std::fmod, vecmathlib::fmod, x, y,
+                         2.0 * accuracy(), y);
+      check_real<RV, RV>("fmod(x,m)", vml_std::fmod, vecmathlib::fmod, x, fm,
+                         2.0 * accuracy(), fm);
+      check_real<RV, RV>("fmod(n,y)", vml_std::fmod, vecmathlib::fmod, fn, y,
+                         2.0 * accuracy(), y);
+      check_real<RV, RV>("remainder(x,y)", vml_std::remainder,
+                         vecmathlib::remainder, x, y, R(2.0) * accuracy(), y);
+      check_real<RV, RV>("remainder(x,m)", vml_std::remainder,
+                         vecmathlib::remainder, x, fm, R(2.0) * accuracy(), fm);
+      check_real<RV, RV>("remainder(n,y)", vml_std::remainder,
+                         vecmathlib::remainder, fn, y, R(2.0) * accuracy(), y);
     }
   }
-  
-  static void test_sin()
-  {
+
+  static void test_sin() {
     cout << "   testing cos sin tan...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-10.0), R(+10.0));
       check_real<RV>("sin", vml_std::sin, vecmathlib::sin, x, accuracy(4));
       check_real<RV>("cos", vml_std::cos, vecmathlib::cos, x, accuracy(4));
     }
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x0 = random(R(-1.55), R(+1.55));
       const intvec_t n = random(I(-10), I(+10));
       const realvec_t x = x0 + vecmathlib::convert_float(n) * RV(M_PI);
       // tan loses accuracy near pi/2
       // (by definition, not by implementation?)
-      check_real<RV>("tan",
-                     vml_std::tan, vecmathlib::tan, x, R(20.0)*accuracy(5));
+      check_real<RV>("tan", vml_std::tan, vecmathlib::tan, x,
+                     R(20.0) * accuracy(5));
     }
   }
-  
-  static void test_sinh()
-  {
+
+  static void test_sinh() {
     cout << "   testing cosh sinh tanh...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(-10.0), R(+10.0));
       check_real<RV>("sinh", vml_std::sinh, vecmathlib::sinh, x, accuracy(4));
       check_real<RV>("cosh", vml_std::cosh, vecmathlib::cosh, x, accuracy(4));
       check_real<RV>("tanh", vml_std::tanh, vecmathlib::tanh, x, accuracy(5));
     }
   }
-  
-  static real_t local_rsqrt(real_t x) { return R(1.0)/sqrt(x); }
-  static void test_sqrt()
-  {
+
+  static real_t local_rsqrt(real_t x) { return R(1.0) / sqrt(x); }
+  static void test_sqrt() {
     cout << "   testing cbrt hypot rsqrt sqrt...\n" << flush;
-    for (int i=0; i<imax; ++i) {
+    for (int i = 0; i < imax; ++i) {
       const realvec_t x = random(R(1.0e-3), R(1.0e+3));
       const realvec_t y = random(-R(1.0e+3), R(1.0e+3));
       const realvec_t z = random(-R(1.0e+3), R(1.0e+3));
       check_real<RV>("cbrt", vml_std::cbrt, vecmathlib::cbrt, x, accuracy());
-      check_real<RV,RV>("hypot",
-                        vml_std::hypot, vecmathlib::hypot, y, z, accuracy());
+      check_real<RV, RV>("hypot", vml_std::hypot, vecmathlib::hypot, y, z,
+                         accuracy());
       check_real<RV>("rsqrt", local_rsqrt, vecmathlib::rsqrt, x, accuracy());
       check_real<RV>("sqrt", vml_std::sqrt, vecmathlib::sqrt, x, accuracy());
     }
   }
-  
-  
-  
-  static void test()
-  {
+
+  static void test() {
     cout << "\n"
          << "Testing math functions for type " << realvec_t::name() << ":\n";
-    
+
     test_bool();
     test_int();
     test_real();
-    
+
     test_mem();
-    
+
     // Test "basic" functions first
     test_abs();
     test_fabs();
@@ -1615,90 +1632,86 @@ struct vecmathlib_test {
   }
 };
 
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
   using namespace vecmathlib;
 
   cout << "Testing math functions:\n"
-       << "[" VECMATHLIB_CONFIGURATION "]\n"
-       << flush;
-  
-  vecmathlib_test<realpseudovec<float,1> >::test();
+       << "[" VECMATHLIB_CONFIGURATION "]\n" << flush;
+
+  vecmathlib_test<realpseudovec<float, 1>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<float,1> >::test();
+  vecmathlib_test<realbuiltinvec<float, 1>>::test();
 #endif
-  vecmathlib_test<realtestvec<float,1> >::test();
+  vecmathlib_test<realtestvec<float, 1>>::test();
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
-  vecmathlib_test<realvec<float,1> >::test();
+  vecmathlib_test<realvec<float, 1>>::test();
 #endif
-  vecmathlib_test<realpseudovec<float,2> >::test();
+  vecmathlib_test<realpseudovec<float, 2>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<float,2> >::test();
+  vecmathlib_test<realbuiltinvec<float, 2>>::test();
 #endif
-  vecmathlib_test<realtestvec<float,2> >::test();
+  vecmathlib_test<realtestvec<float, 2>>::test();
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_2
-  vecmathlib_test<realvec<float,2> >::test();
+  vecmathlib_test<realvec<float, 2>>::test();
 #endif
-  vecmathlib_test<realpseudovec<float,4> >::test();
+  vecmathlib_test<realpseudovec<float, 4>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<float,4> >::test();
+  vecmathlib_test<realbuiltinvec<float, 4>>::test();
 #endif
-  vecmathlib_test<realtestvec<float,4> >::test();
+  vecmathlib_test<realtestvec<float, 4>>::test();
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_4
-  vecmathlib_test<realvec<float,4> >::test();
+  vecmathlib_test<realvec<float, 4>>::test();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
-  vecmathlib_test<realpseudovec<float,8> >::test();
+  vecmathlib_test<realpseudovec<float, 8>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<float,8> >::test();
+  vecmathlib_test<realbuiltinvec<float, 8>>::test();
 #endif
-  vecmathlib_test<realtestvec<float,8> >::test();
-  vecmathlib_test<realvec<float,8> >::test();
+  vecmathlib_test<realtestvec<float, 8>>::test();
+  vecmathlib_test<realvec<float, 8>>::test();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_FLOAT_16
-  vecmathlib_test<realpseudovec<float,16> >::test();
+  vecmathlib_test<realpseudovec<float, 16>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<float,16> >::test();
+  vecmathlib_test<realbuiltinvec<float, 16>>::test();
 #endif
-  vecmathlib_test<realtestvec<float,16> >::test();
-  vecmathlib_test<realvec<float,16> >::test();
+  vecmathlib_test<realtestvec<float, 16>>::test();
+  vecmathlib_test<realvec<float, 16>>::test();
 #endif
-  
-  vecmathlib_test<realpseudovec<double,1> >::test();
+
+  vecmathlib_test<realpseudovec<double, 1>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<double,1> >::test();
+  vecmathlib_test<realbuiltinvec<double, 1>>::test();
 #endif
-  vecmathlib_test<realtestvec<double,1> >::test();
+  vecmathlib_test<realtestvec<double, 1>>::test();
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
-  vecmathlib_test<realvec<double,1> >::test();
+  vecmathlib_test<realvec<double, 1>>::test();
 #endif
-  vecmathlib_test<realpseudovec<double,2> >::test();
+  vecmathlib_test<realpseudovec<double, 2>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<double,2> >::test();
+  vecmathlib_test<realbuiltinvec<double, 2>>::test();
 #endif
-  vecmathlib_test<realtestvec<double,2> >::test();
+  vecmathlib_test<realtestvec<double, 2>>::test();
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
-  vecmathlib_test<realvec<double,2> >::test();
+  vecmathlib_test<realvec<double, 2>>::test();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
-  vecmathlib_test<realpseudovec<double,4> >::test();
+  vecmathlib_test<realpseudovec<double, 4>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<double,4> >::test();
+  vecmathlib_test<realbuiltinvec<double, 4>>::test();
 #endif
-  vecmathlib_test<realtestvec<double,4> >::test();
-  vecmathlib_test<realvec<double,4> >::test();
+  vecmathlib_test<realtestvec<double, 4>>::test();
+  vecmathlib_test<realvec<double, 4>>::test();
 #endif
 #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8
-  vecmathlib_test<realpseudovec<double,8> >::test();
+  vecmathlib_test<realpseudovec<double, 8>>::test();
 #ifdef __clang__
-  vecmathlib_test<realbuiltinvec<double,8> >::test();
+  vecmathlib_test<realbuiltinvec<double, 8>>::test();
 #endif
-  vecmathlib_test<realtestvec<double,8> >::test();
-  vecmathlib_test<realvec<double,8> >::test();
+  vecmathlib_test<realtestvec<double, 8>>::test();
+  vecmathlib_test<realvec<double, 8>>::test();
 #endif
-  
+
   cout << "\n";
   if (num_errors == 0) {
     cout << "SUCCESS";
@@ -1706,6 +1719,6 @@ int main(int argc, char** argv)
     cout << "FAILURE";
   }
   cout << ": " << num_errors << " errors found\n" << flush;
-  
+
   return num_errors == 0 ? 0 : 1;
 }
diff --git a/vec_altivec_float4.h b/vec_altivec_float4.h
index 14e0308..55530b4 100644
--- a/vec_altivec_float4.h
+++ b/vec_altivec_float4.h
@@ -13,647 +13,566 @@
 #include <altivec.h>
 
 #if defined __clang__
-#  define __vector vector
-#  define __pixel pixel
-#  define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
 #elif defined __gcc__
-#  undef vector
-#  undef pixel
-#  undef bool
+#undef vector
+#undef pixel
+#undef bool
 #elif defined __xlC__
-#  define __bool bool
+#define __bool bool
 #else
-#  error "Unknown compiler"
+#error "Unknown compiler"
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __vector __bool int bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v((bvector_t)vec_splats(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_nor(v, v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    // boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator==(boolvec x) const; // defined after intvec
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const { return vec_all_ne(v, BV(false).v); }
-    bool any() const { return vec_any_ne(v, BV(false).v); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __vector signed int ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector signed int){0, 1, 2, 3}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool int)v; }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __vector __bool int bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v((bvector_t)vec_splats(from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_nor(v, v); }
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  // boolvec operator==(boolvec x) const { return !(*this!=x); }
+  boolvec operator==(boolvec x) const; // defined after intvec
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const { return vec_all_ne(v, BV(false).v); }
+  bool any() const { return vec_any_ne(v, BV(false).v); }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __vector signed int ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats(a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() { return (__vector signed int){0, 1, 2, 3}; }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit battern
+  boolvec_t as_bool() const { return (__vector __bool int)v; }
+  boolvec_t convert_bool() const { return *this != IV(0); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const {
 #if defined __xlC_
-      return vec_neg(v);
+    return vec_neg(v);
 #else
-      // vec_neg does not exist in clang
-      return IV(I(0)) - *this;
+    // vec_neg does not exist in clang
+    return IV(I(0)) - *this;
 #endif
+  }
+
+  intvec operator+(intvec x) const { return vec_add(v, x.v); }
+  intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+  intvec operator~() const { return vec_nor(v, v); }
+
+  intvec operator&(intvec x) const { return vec_and(v, x.v); }
+  intvec operator|(intvec x) const { return vec_or(v, x.v); }
+  intvec operator^(intvec x) const { return vec_xor(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    return vec_sr(v, (__vector unsigned int)n.v);
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    return vec_sra(v, (__vector unsigned int)n.v);
+  }
+  intvec operator<<(intvec n) const {
+    return vec_sl(v, (__vector unsigned int)n.v);
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec const &x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return !(*this < x); }
+
+  intvec_t abs() const { return vec_abs(v); } // lane-wise absolute value
+  boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); } // arithmetic shift by bits-1 smears each lane's sign bit over the lane, then as_bool() converts to a mask
+  intvec_t max(intvec_t x) const { return vec_max(v, x.v); } // lane-wise maximum
+  intvec_t min(intvec_t x) const { return vec_min(v, x.v); } // lane-wise minimum
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef __vector float vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<Altivec:4*float>"; } // fixed label for this specialization
+  void barrier() { __asm__("" : "+v"(v)); } // empty asm naming v as a read-write operand; presumably keeps the compiler from optimizing across this point - confirm
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {} // v is left uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {} // wrap a raw AltiVec vector
+  realvec(real_t a) : v(vec_splats(a)) {} // initialize v from vec_splats(a)
+  realvec(real_t const *as) { // copy `size` scalars from as[0..size-1]; no alignment assertion on this path
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; } // implicit conversion back to the raw vector
+  real_t operator[](int n) const { // read lane n (by value) through the shared vecmathlib::get_elt helper
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) { // write lane n via vecmathlib::set_elt
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; // comma operator: do the write, then return *this
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) { // aligned load of one vector starting at p
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of sizeof(vector_t)
+    return vec_ld(0, p);
+  }
+  static realvec_t loadu(real_t const *p) { // unaligned load assembled from two vec_ld results
+    realvec_t v0 = vec_ld(0, p); // vec_ld drops the low address bits: this is the aligned block containing p
+    realvec_t v1 = vec_ld(15, p); // aligned block containing p+15 (the same block when p is already aligned)
+    return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); // vec_lvsl derives the permute mask that picks the 16 bytes starting at p
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { // load from p + ioff, where p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole-vector offset keeps p + ioff aligned, so the aligned load applies
+      return loada(p + ioff);
+    return loadu(p + ioff); // otherwise fall back to the unaligned load
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    
-    intvec operator+(intvec x) const { return vec_add(v, x.v); }
-    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return vec_nor(v, v); }
-    
-    intvec operator&(intvec x) const { return vec_and(v, x.v); }
-    intvec operator|(intvec x) const { return vec_or(v, x.v); }
-    intvec operator^(intvec x) const { return vec_xor(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return vec_sr(v, (__vector unsigned int)n.v);
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vec_sra(v, (__vector unsigned int)n.v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vec_sl(v, (__vector unsigned int)n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return !(*this > x); }
-    boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return !(*this < x); }
-    
-    intvec_t abs() const { return vec_abs(v); }
-    boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
-    intvec_t max(intvec_t x) const { return vec_max(v, x.v); }
-    intvec_t min(intvec_t x) const { return vec_min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __vector float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<Altivec:4*float>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_ld(0, p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t v0 = vec_ld(0, p);
-      realvec_t v1 = vec_ld(15, p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_st(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked load from p + ioff, where p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole-vector offset keeps p + ioff aligned
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m); // otherwise use the masked unaligned load
+  }
+
+  void storea(real_t *p) const { // aligned store of the whole vector to p
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of sizeof(vector_t)
+    vec_st(v, 0, p);
+  }
+  void storeu(real_t *p) const { // unaligned store: writes the four lanes one scalar at a time
+    // Vector stores would require vector loads, which would need to
+    // be atomic
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0]; // operator[] extracts each lane by value
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const { // store to p + ioff, where p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole-vector offset keeps p + ioff aligned
+      return storea(p + ioff); // returning a void expression: aligned store, then leave
+    storeu(p + ioff); // otherwise the element-wise unaligned store
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // Use vec_ste?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    
-    
-    
-    intvec_t as_int() const { return (__vector signed int) v; }
-    intvec_t convert_int() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked store to p + ioff, where p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole-vector offset keeps p + ioff aligned
+      return storea(p + ioff, m); // returning a void expression: masked aligned store, then leave
+    storeu(p + ioff, m); // otherwise the masked unaligned store
+  }
+
+  intvec_t as_int() const { return (__vector signed int)v; } // vector cast: same bits viewed as signed ints, no numeric conversion
+  intvec_t convert_int() const {
 #if defined __xlC__
-      return vec_cts(v, 0);
+    return vec_cts(v, 0);
 #else
-      // vec_cts leads to an ICE in clang
-      return MF::vml_convert_int(*this);
+    // vec_cts leads to an ICE in clang
+    return MF::vml_convert_int(*this);
 #endif
-    }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const
-    {
+  }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const {
 #if defined __xlC_
-      return vec_neg(v);
+    return vec_neg(v);
 #else
-      // vec_neg does not exist in clang
-      return RV(0.0) - *this;
+    // vec_neg does not exist in clang
+    return RV(0.0) - *this;
 #endif
-    }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const {
+  }
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); } // lane-wise addition
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); } // lane-wise subtraction
+  realvec operator*(realvec x) const {
 #if defined __xlC__
-      return vec_mul(v, x.v);
+    return vec_mul(v, x.v);
 #else
-      // vec_mul does not exist in clang
-      return vec_madd(v, x.v, RV(0.0).v);
+    // vec_mul does not exist in clang
+    return vec_madd(v, x.v, RV(0.0).v);
 #endif
-    }
-    realvec operator/(realvec x) const {
+  }
+  realvec operator/(realvec x) const {
 #if defined __xlC__
-      return vec_div(v, x.v);
+    return vec_div(v, x.v);
 #else
-      // vec_div does not exist in clang
-      return *this * x.rcp();
+    // vec_div does not exist in clang
+    return *this * x.rcp();
 #endif
-    }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-                           vml_std::fmax((*this)[2], (*this)[3]));
-    }
-    real_t minval() const
-    {
-      return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-                           vml_std::fmin((*this)[2], (*this)[3]));
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return vec_madd(v, y.v, z.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_round(v); /* sic! */ }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // one Newton iteration (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const {
+  }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; } // compound forms reuse the binary operators
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { // horizontal maximum over the four lanes, reduced pairwise with scalar fmax
+    return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+                         vml_std::fmax((*this)[2], (*this)[3]));
+  }
+  real_t minval() const { // horizontal minimum over the four lanes, reduced pairwise with scalar fmin
+    return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+                         vml_std::fmin((*this)[2], (*this)[3]));
+  }
+  real_t prod() const { // horizontal product, evaluated left to right in scalar arithmetic
+    return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+  }
+  real_t sum() const { // horizontal sum, evaluated left to right in scalar arithmetic
+    return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); } // lane-wise compare producing a boolvec mask
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); } // derived by negating ==
+  boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); } // unlike intvec, <= and >= use dedicated compares
+  boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); } // most math members forward to MF (mathfuncs<realvec_t>)
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return vec_ceil(v); } // direct AltiVec intrinsic
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vec_abs(v); } // direct AltiVec intrinsic
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); } // direct AltiVec intrinsic
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } // (*this) * y + z via vec_madd
+  realvec fmax(realvec y) const { return vec_max(v, y.v); } // direct AltiVec intrinsic
+  realvec fmin(realvec y) const { return vec_min(v, y.v); } // direct AltiVec intrinsic
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } // r is passed through to vml_frexp as an out-parameter
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const { // same vec_madd call as fma()
+    return vec_madd(v, y.v, z.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const { // reciprocal: hardware estimate refined by one Newton step
+    realvec x = *this;
+    realvec r = vec_re(v); // this is only an approximation
+    // TODO: use fma
+    // Note: don't rewrite this expression, this may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r); // one Newton iteration (see vml_rcp)
+    return r; // NOTE(review): for x == 0 or inf, x * r is presumably 0 * inf = NaN, making the result NaN - confirm
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } // generic implementation
+  realvec rint() const { return vec_round(v); /* sic! */ } // vec_round is deliberate per the marker; NOTE(review): presumably it rounds to nearest-even, matching rint - confirm
+  realvec round() const { return MF::vml_round(*this); } // round() does not use vec_round; it takes the generic implementation
+  realvec rsqrt() const { // reciprocal square root: hardware estimate refined by one Newton step
+    realvec x = *this;
+    realvec r = vec_rsqrte(x.v); // this is only an approximation
+    // TODO: use fma
+    // one Newton iteration (see vml_rsqrt)
+    r += RV(0.5) * r * (RV(1.0) - x * r * r); // r <- r + r/2 * (1 - x*r^2)
+    return r; // NOTE(review): for x == 0 the estimate is presumably inf and x * r * r becomes NaN - confirm
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); } // these three forward to the generic MF implementations
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const {
 #if defined __xlC__
-      return vec_sqrt(v);
+    return vec_sqrt(v);
 #else
-      return *this * rsqrt();
+    return *this * rsqrt();
 #endif
-    }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return (__vector signed int) v;
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return -(__vector signed int)v;
-  }
-  
-  inline boolvec<float,4> boolvec<float,4>::operator==(boolvec_t x) const
-  {
-    return as_int() == x.as_int();
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return (__vector float)v;
-  }
-  
-  inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
+  }
+  realvec tan() const { return MF::vml_tan(*this); } // generic implementation
+  realvec tanh() const { return MF::vml_tanh(*this); } // generic implementation
+  realvec trunc() const { return vec_trunc(v); } // direct AltiVec intrinsic
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+  return (__vector signed int)v; // vector cast: the mask bits viewed as signed ints, no numeric conversion
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+  return -(__vector signed int)v; // negate the mask lanes; presumably true lanes are all-ones (-1) so they become 1 - confirm against boolvec
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::operator==(boolvec_t x) const {
+  return as_int() == x.as_int(); // compare the masks lane by lane as integers
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vec_sel(y.v, x.v, v); // vec_sel takes bits of x where this mask is set and bits of y elsewhere
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vec_sel(y.v, x.v, v); // vec_sel takes bits of x where this mask is set and bits of y elsewhere
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vec_sel(y.v, x.v, v); // vec_sel takes bits of x where this mask is set and bits of y elsewhere
+}
+
+// intvec definitions
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+  return (__vector float)v; // vector cast: the integer bits viewed as floats, no numeric conversion
+}
+
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y); // forwards to the generic implementation with *this as first argument
+}
+
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+  return MF::vml_clz(*this); // forwards to the generic implementation
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
 #if defined __xlC__
-    return vec_ctf(v, 0);
+  return vec_ctf(v, 0);
 #else
-      // vec_ctf leads to an ICE in clang
-    return MF::vml_convert_float(*this);
+  // vec_ctf leads to an ICE in clang
+  return MF::vml_convert_float(*this);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+  return MF::vml_popcount(*this); // forwards to the generic implementation
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n); // scalar-count rotate, forwarded to the generic implementation
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n); // vector-count rotate, forwarded to the generic implementation
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_ALTIVEC_FLOAT4_H
+#endif // #ifndef VEC_ALTIVEC_FLOAT4_H
diff --git a/vec_avx_double4.h b/vec_avx_double4.h
index 1352712..f01e74c 100644
--- a/vec_avx_double4.h
+++ b/vec_avx_double4.h
@@ -12,288 +12,244 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m256d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
-                                            from_bool(as[2]),
-                                            from_bool(as[1]),
-                                            from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-      return ! (! *this).any();
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-      return ! bool(_mm256_testz_pd(v, v));
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+template <> struct boolvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __m256d bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm256_castsi256_pd(
+            _mm256_set_epi64x(from_bool(as[3]), from_bool(as[2]),
+                              from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
+
+  bool all() const {
+    // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+    return !(!*this).any();
+  }
+  bool any() const {
+    // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+    return !bool(_mm256_testz_pd(v, v));
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi64x(a)) {}
+  intvec(int_t const *as) : v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+  static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
+  boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
 #ifdef __AVX2__
-      return *this != IV(I(0));
+    return *this != IV(I(0));
 #else
-      // There is no intrinsic to compare to zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    // There is no intrinsic to compare to zero. Instead, we test the
+    // sign bit of x and the sign bit of x + (signbit_mask - 1).
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
 #endif
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(I(0)) - *this; }
-    
-    intvec_t operator+(intvec_t x) const
-    {
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(I(0)) - *this; }
+
+  intvec_t operator+(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_add_epi64(v, x.v);
+    return _mm256_add_epi64(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi64(vlo, xvlo);
-      vhi = _mm_add_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi64(vlo, xvlo);
+    vhi = _mm_add_epi64(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t operator-(intvec_t x) const
-    {
+  }
+  intvec_t operator-(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_sub_epi64(v, x.v);
+    return _mm256_sub_epi64(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi64(vlo, xvlo);
-      vhi = _mm_sub_epi64(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi64(vlo, xvlo);
+    vhi = _mm_sub_epi64(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
+  }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_and_si256(v, x.v);
+    return _mm256_and_si256(v, x.v);
 #else
-      return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
+    return _mm256_castpd_si256(
+        _mm256_and_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
 #endif
-    }
-    intvec_t operator|(intvec_t x) const
-    {
+  }
+  intvec_t operator|(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_or_si256(v, x.v);
+    return _mm256_or_si256(v, x.v);
 #else
-      return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
-                                              _mm256_castsi256_pd(x.v)));
+    return _mm256_castpd_si256(
+        _mm256_or_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
 #endif
-    }
-    intvec_t operator^(intvec_t x) const
-    {
+  }
+  intvec_t operator^(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_xor_si256(v, x.v);
+    return _mm256_xor_si256(v, x.v);
 #else
-      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
-                                               _mm256_castsi256_pd(x.v)));
+    return _mm256_castpd_si256(
+        _mm256_xor_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
 #endif
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
 #ifdef __AVX2__
-      return _mm256_srli_epi64(v, n);
+    return _mm256_srli_epi64(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi64(vlo, n);
+    vhi = _mm_srli_epi64(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
 #ifdef __AVX2__
-      // There is no _mm256_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
-      intvec_t offset = U(1) << (bits-1);
-      return (*this + offset).lsr(n) - offset.lsr(n);
+    // There is no _mm256_srai_epi64. To emulate it, add the sign-bit
+    // offset 1 << (bits-1) before shifting, and subtract the shifted
+    // offset after shifting
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+// There is no _mm_srai_epi64. To emulate it, add the sign-bit offset
+// 1 << (bits-1) before shifting, and subtract the shifted offset
+// after shifting
 #if 0
       __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
                                          _mm_srli_epi64(vlo, 63));
@@ -306,532 +262,445 @@ namespace vecmathlib {
       vlo = _mm_xor_si128(signmask01, vlo);
       vhi = _mm_xor_si128(signmask23, vhi);
 #else
-      // Convert signed to unsiged
-      vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1)));
-      vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1)));
-      // Shift
-      vlo = _mm_srli_epi64(vlo, n);
-      vhi = _mm_srli_epi64(vhi, n);
-      // Undo conversion
-      vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n)));
-      vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n)));
+    // Convert signed to unsigned
+    vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1)));
+    vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1)));
+    // Shift
+    vlo = _mm_srli_epi64(vlo, n);
+    vhi = _mm_srli_epi64(vhi, n);
+    // Undo conversion
+    vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
+    vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
 #endif
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t operator<<(int_t n) const
-    {
+  }
+  intvec_t operator<<(int_t n) const {
 #ifdef __AVX2__
-      return _mm256_slli_epi64(v, n);
+    return _mm256_slli_epi64(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi64(vlo, n);
-      vhi = _mm_slli_epi64(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi64(vlo, n);
+    vhi = _mm_slli_epi64(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
+  }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
 #ifdef __AVX2__
-      return _mm256_srlv_epi64(v, n.v);
+    return _mm256_srlv_epi64(v, n.v);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
+    return r;
+#endif
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
 #ifdef __AVX2__
-      // See operator>> above
-      intvec_t offset = U(1) << (bits-1);
-      return (*this + offset).lsr(n) - offset.lsr(n);
+    // See operator>> above
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    intvec_t operator<<(intvec_t n) const
-    {
+    return r;
+#endif
+  }
+  intvec_t operator<<(intvec_t n) const {
 #ifdef __AVX2__
-      return _mm256_sllv_epi64(v, n.v);
+    return _mm256_sllv_epi64(v, n.v);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
+    return r;
+#endif
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const {
 #ifdef __AVX2__
-      return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v));
+    return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v));
 #else
-      return ! (*this != x);
+    return !(*this != x);
 #endif
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
 #ifdef __AVX2__
-      return ! (*this == x);
+    return !(*this == x);
 #else
-      return (*this ^ x).convert_bool();
+    return (*this ^ x).convert_bool();
 #endif
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
+  }
+  boolvec_t operator<(intvec_t const &x) const {
 #ifdef __AVX2__
-      return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v));
+    return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v));
 #else
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-#endif
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m256d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() {
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef __m256d vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
 #ifdef __AVX2__
-      return "<AVX2:4*double>";
+    return "<AVX2:4*double>";
 #else
-      return "<AVX:4*double>";
+    return "<AVX:4*double>";
 #endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_pd(a)) {}
+  realvec(real_t const *as) : v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_pd(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm256_loadu_pd(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_pd(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_pd(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_pd(p, m.m.as_int(), v);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        for (int d=0; d<size; ++d) {
-          if (m.m[d]) p[d] = (*this)[d];
-        }
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_pd(p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_pd(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      _mm256_maskstore_pd(p, m.m.as_int(), v);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm256_castpd_si256(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) {
-        r.set_elt(d, floatprops::convert_int((*this)[d]));
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      for (int d = 0; d < size; ++d) {
+        if (m.m[d])
+          p[d] = (*this)[d];
       }
-      return r;
-    }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
-    }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
-    }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
-      realvec_t y0022 = x0123 * x1032;
-      return y0022[0] * y0022[2];
-    }
-    real_t sum() const
-    {
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      // __m256d x = _mm256_hadd_pd(v, v);
-      // __m128d xlo = _mm256_extractf128_pd(x, 0);
-      // __m128d xhi = _mm256_extractf128_pd(x, 1);
-      realvec_t x = *this;
-      x = _mm256_hadd_pd(x.v, x.v);
-      return x[0] + x[2];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return _mm256_ceil_pd(v); }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return _mm256_floor_pd(v); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
     }
-    realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
-#ifdef VML_HAVE_NAN
-      return _mm256_cmp_pd(v, v, _CMP_UNORD_Q);
-#else
-      return BV(false);
-#endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
-      return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm256_sqrt_pd(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return _mm256_castpd_si256(v);
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm256_blendv_pd(y.v, x.v, v);
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,4> intvec<double,4>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline
-  intvec<double,4> intvec<double,4>::bitifthen(intvec_t x, intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return _mm256_castsi256_pd(v);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    realvec_t r;
-    for (int d=0; d<size; ++d) {
-      r.set_elt(d, floatprops::convert_float((*this)[d]));
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm256_castpd_si256(v); }
+  intvec_t convert_int() const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d) {
+      r.set_elt(d, floatprops::convert_int((*this)[d]));
     }
     return r;
   }
-  
-  inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123.fmax(x1032);
+    return vml_std::fmax(y0022[0], y0022[2]);
+  }
+  real_t minval() const {
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123.fmin(x1032);
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const {
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
+    realvec_t y0022 = x0123 * x1032;
+    return y0022[0] * y0022[2];
+  }
+  real_t sum() const {
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    // __m256d x = _mm256_hadd_pd(v, v);
+    // __m128d xlo = _mm256_extractf128_pd(x, 0);
+    // __m128d xhi = _mm256_extractf128_pd(x, 1);
+    realvec_t x = *this;
+    x = _mm256_hadd_pd(x.v, x.v);
+    return x[0] + x[2];
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
   }
-  
-  inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
   }
-  
-  inline intvec<double,4> intvec<double,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return _mm256_ceil_pd(v); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return _mm256_floor_pd(v); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
+#ifdef VML_HAVE_NAN
+    return _mm256_cmp_pd(v, v, _CMP_UNORD_Q);
+#else
+    return BV(false);
+#endif
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
   }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
+    return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
   }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm256_sqrt_pd(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const {
+  return _mm256_castpd_si256(v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  // return ifthen(v, U(1), U(0));
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return _mm256_blendv_pd(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const {
+  return _mm256_castsi256_pd(v);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+  realvec_t r;
+  for (int d = 0; d < size; ++d) {
+    r.set_elt(d, floatprops::convert_float((*this)[d]));
   }
-  
+  return r;
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_DOUBLE4_H
+#endif // #ifndef VEC_AVX_DOUBLE4_H
diff --git a/vec_avx_float8.h b/vec_avx_float8.h
index ec1e132..f119aee 100644
--- a/vec_avx_float8.h
+++ b/vec_avx_float8.h
@@ -12,828 +12,697 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_8
-  template<> struct boolvec<float,8>;
-  template<> struct intvec<float,8>;
-  template<> struct realvec<float,8>;
-  
-  
-  
-  template<>
-  struct boolvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef bool scalar_t;
-    typedef __m256 bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]),
-                                           from_bool(as[6]),
-                                           from_bool(as[5]),
-                                           from_bool(as[4]),
-                                           from_bool(as[3]),
-                                           from_bool(as[2]),
-                                           from_bool(as[1]),
-                                           from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); }
-    
-    bool all() const
-    {
-      // return
-      //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
-      //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
-      return ! (! *this).any();
-    }
-    bool any() const
-    {
-      // return
-      //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
-      //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
-      return ! bool(_mm256_testz_ps(v, v));
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi32(a)) {}
-    intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4],
-                                                as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
+template <> struct boolvec<float, 8>;
+template <> struct intvec<float, 8>;
+template <> struct realvec<float, 8>;
+
+template <> struct boolvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef bool scalar_t;
+  typedef __m256 bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm256_castsi256_ps(_mm256_set_epi32(
+            from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+            from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+            from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); }
+
+  bool all() const {
+    // return
+    //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+    //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
+    return !(!*this).any();
+  }
+  bool any() const {
+    // return
+    //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+    //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
+    return !bool(_mm256_testz_ps(v, v));
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi32(a)) {}
+  intvec(int_t const *as)
+      : v(_mm256_set_epi32(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+                           as[0])) {}
+  static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
+  boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
 #ifdef __AVX2__
-      return *this != IV(I(0));
+    return *this != IV(I(0));
 #else
-      // There is no intrinsic to compare to zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is positive and x-1 is negative.
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
 #endif
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(0) - *this; }
-    
-    intvec_t operator+(intvec_t x) const
-    {
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(0) - *this; }
+
+  intvec_t operator+(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_add_epi32(v, x.v);
+    return _mm256_add_epi32(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi32(vlo, xvlo);
-      vhi = _mm_add_epi32(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi32(vlo, xvlo);
+    vhi = _mm_add_epi32(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t operator-(intvec_t x) const
-    {
+  }
+  intvec_t operator-(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_sub_epi32(v, x.v);
+    return _mm256_sub_epi32(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi32(vlo, xvlo);
-      vhi = _mm_sub_epi32(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi32(vlo, xvlo);
+    vhi = _mm_sub_epi32(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
+  }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_and_si256(v, x.v);
+    return _mm256_and_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec_t operator|(intvec_t x) const
-    {
+  }
+  intvec_t operator|(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_or_si256(v, x.v);
+    return _mm256_or_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec_t operator^(intvec_t x) const
-    {
+  }
+  intvec_t operator^(intvec_t x) const {
 #ifdef __AVX2__
-      return _mm256_xor_si256(v, x.v);
+    return _mm256_xor_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
 #ifdef __AVX2__
-      return _mm256_srli_epi32(v, n);
+    return _mm256_srli_epi32(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi32(vlo, n);
-      vhi = _mm_srli_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi32(vlo, n);
+    vhi = _mm_srli_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
 #ifdef __AVX2__
-      return _mm256_srai_epi32(v, n);
+    return _mm256_srai_epi32(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srai_epi32(vlo, n);
-      vhi = _mm_srai_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srai_epi32(vlo, n);
+    vhi = _mm_srai_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t operator<<(int_t n) const
-    {
+  }
+  intvec_t operator<<(int_t n) const {
 #ifdef __AVX2__
-      return _mm256_slli_epi32(v, n);
+    return _mm256_slli_epi32(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi32(vlo, n);
-      vhi = _mm_slli_epi32(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi32(vlo, n);
+    vhi = _mm_slli_epi32(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
+  }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
 #ifdef __AVX2__
-      return _mm256_srlv_epi32(v, n.v);
+    return _mm256_srlv_epi32(v, n.v);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
+    return r;
+#endif
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
 #ifdef __AVX2__
-      return _mm256_srav_epi32(v, n.v);
+    return _mm256_srav_epi32(v, n.v);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    intvec_t operator<<(intvec_t n) const
-    {
+    return r;
+#endif
+  }
+  intvec_t operator<<(intvec_t n) const {
 #ifdef __AVX2__
-      return _mm256_sllv_epi32(v, n.v);
+    return _mm256_sllv_epi32(v, n.v);
 #else
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-#endif
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
+    return r;
+#endif
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const {
 #ifdef __AVX2__
-      return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v));
+    return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v));
 #else
-      return ! (*this != x);
+    return !(*this != x);
 #endif
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
 #ifdef __AVX2__
-      return ! (*this == x);
+    return !(*this == x);
 #else
-      return (*this ^ x).convert_bool();
+    return (*this ^ x).convert_bool();
 #endif
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
+  }
+  boolvec_t operator<(intvec_t const &x) const {
 #ifdef __AVX2__
-      return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v));
+    return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v));
 #else
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-#endif
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,8>: floatprops<float>
-  {
-    static int const size = 8;
-    typedef real_t scalar_t;
-    typedef __m256 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() {
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<float, 8> : floatprops<float> {
+  static int const size = 8;
+  typedef real_t scalar_t;
+  typedef __m256 vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
 #ifdef __AVX2__
-      return "<AVX2:8*float>";
+    return "<AVX2:8*float>";
 #else
-      return "<AVX:8*float>";
+    return "<AVX:8*float>";
 #endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_ps(a)) {}
+  realvec(real_t const *as)
+      : v(_mm256_set_ps(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+                        as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_ps(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm256_loadu_ps(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_ps(a)) {}
-    realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4],
-                                               as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_ps(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_ps(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_ps(p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_ps(p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        _mm256_maskstore_ps(p, m.m.as_int(), v);
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm256_castps_si256(v); }
-    intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return
-      //   vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                               vml_std::fmax((*this)[2], (*this)[3])),
-      //                 vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
-      //                               vml_std::fmax((*this)[6], (*this)[7])));
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567.fmax(x10325476);
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466.fmax(y22006644);
-      return vml_std::fmax(z00004444[0], z00004444[4]);
-    }
-    real_t minval() const
-    {
-      // return
-      //   vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                               vml_std::fmin((*this)[2], (*this)[3])),
-      //                 vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
-      //                               vml_std::fmin((*this)[6], (*this)[7])));
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567.fmin(x10325476);
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466.fmin(y22006644);
-      return vml_std::fmin(z00004444[0], z00004444[4]);
-    }
-    real_t prod() const
-    {
-      // return
-      //   (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
-      //   (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
-      realvec_t x01234567 = *this;
-      realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
-      realvec_t y00224466 = x01234567 * x10325476;
-      realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
-      realvec_t z00004444 = y00224466 * y22006644;
-      return z00004444[0] * z00004444[4];
-    }
-    real_t sum() const
-    {
-      // return
-      //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
-      //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
-      // _m256 x = vhaddps(v, v);
-      // x = vhaddps(x, x);
-      // __m128 xlo = _mm256_extractf128_ps(x, 0);
-      // __m128 xhi = _mm256_extractf128_ps(x, 1);
-      // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi);
-      realvec_t x = *this;
-      x = _mm256_hadd_ps(x.v, x.v);
-      x = _mm256_hadd_ps(x.v, x.v);
-      return x[0] + x[4];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_ps(p, v);
+  }
+  void storeu(real_t *p) const { return _mm256_storeu_ps(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      _mm256_maskstore_ps(p, m.m.as_int(), v);
     }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return _mm256_ceil_ps(v); }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return _mm256_floor_ps(v); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
     }
-    realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm256_castps_si256(v); }
+  intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return
+    //   vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                               vml_std::fmax((*this)[2], (*this)[3])),
+    //                 vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
+    //                               vml_std::fmax((*this)[6], (*this)[7])));
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567.fmax(x10325476);
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466.fmax(y22006644);
+    return vml_std::fmax(z00004444[0], z00004444[4]);
+  }
+  real_t minval() const {
+    // return
+    //   vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                               vml_std::fmin((*this)[2], (*this)[3])),
+    //                 vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
+    //                               vml_std::fmin((*this)[6], (*this)[7])));
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567.fmin(x10325476);
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466.fmin(y22006644);
+    return vml_std::fmin(z00004444[0], z00004444[4]);
+  }
+  real_t prod() const {
+    // return
+    //   (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
+    //   (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
+    realvec_t x01234567 = *this;
+    realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
+    realvec_t y00224466 = x01234567 * x10325476;
+    realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
+    realvec_t z00004444 = y00224466 * y22006644;
+    return z00004444[0] * z00004444[4];
+  }
+  real_t sum() const {
+    // return
+    //   (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
+    //   (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
+    // _m256 x = vhaddps(v, v);
+    // x = vhaddps(x, x);
+    // __m128 xlo = _mm256_extractf128_ps(x, 0);
+    // __m128 xhi = _mm256_extractf128_ps(x, 1);
+    // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi);
+    realvec_t x = *this;
+    x = _mm256_hadd_ps(x.v, x.v);
+    x = _mm256_hadd_ps(x.v, x.v);
+    return x[0] + x[4];
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+  }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return _mm256_ceil_ps(v); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return _mm256_floor_ps(v); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
+    return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
-      return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm256_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm256_sqrt_ps(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,8> boolvec<float,8>::as_int() const
-  {
-    return _mm256_castps_si256(v);
-  }
-  
-  inline intvec<float,8> boolvec<float,8>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<float,8> boolvec<float,8>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<float,8> boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<float,8> boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm256_blendv_ps(y.v, x.v, v);
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<float,8> intvec<float,8>::abs() const
-  {
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const {
+    realvec_t x = *this;
+    realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
+    r *= RV(2.0) - r * x;           // one Newton iteration (see vml_rcp)
+    return r;
+  }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
+    return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const {
+    realvec_t x = *this;
+    realvec_t r = _mm256_rsqrt_ps(x);   // this is only an approximation
+    r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt)
+    return r;
+  }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm256_sqrt_ps(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<float, 8> boolvec<float, 8>::as_int() const {
+  return _mm256_castps_si256(v);
+}
+
+inline intvec<float, 8> boolvec<float, 8>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<float, 8> boolvec<float, 8>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<float, 8> boolvec<float, 8>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<float, 8> boolvec<float, 8>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return _mm256_blendv_ps(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<float, 8> intvec<float, 8>::abs() const {
 #ifdef __AVX2__
-    return _mm256_abs_epi32(v);
+  return _mm256_abs_epi32(v);
 #else
-    return MF::vml_abs(*this);
+  return MF::vml_abs(*this);
 #endif
-  }
-  
-  inline realvec<float,8> intvec<float,8>::as_float() const
-  {
-    return _mm256_castsi256_ps(v);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,8> intvec<float,8>::convert_float() const
-  {
-    return _mm256_cvtepi32_ps(v);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,8> intvec<float,8>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline realvec<float, 8> intvec<float, 8>::as_float() const {
+  return _mm256_castsi256_ps(v);
+}
+
+inline intvec<float, 8> intvec<float, 8>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<float, 8> intvec<float, 8>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<float, 8> intvec<float, 8>::convert_float() const {
+  return _mm256_cvtepi32_ps(v);
+}
+
+inline intvec<float, 8> intvec<float, 8>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FLOAT8_H
+#endif // #ifndef VEC_AVX_FLOAT8_H
diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h
index ddade85..6af27e5 100644
--- a/vec_avx_fp16_16.h
+++ b/vec_avx_fp16_16.h
@@ -12,378 +12,309 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FP16_16
-  template<> struct boolvec<fp16,16>;
-  template<> struct intvec<fp16,16>;
-  template<> struct realvec<fp16,16>;
-  
-  
-  
-  template<>
-  struct boolvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi16(from_bool(as[15]),
-                       from_bool(as[14]),
-                       from_bool(as[13]),
-                       from_bool(as[12]),
-                       from_bool(as[11]),
-                       from_bool(as[10]),
-                       from_bool(as[ 9]),
-                       from_bool(as[ 8]),
-                       from_bool(as[ 7]),
-                       from_bool(as[ 6]),
-                       from_bool(as[ 5]),
-                       from_bool(as[ 4]),
-                       from_bool(as[ 3]),
-                       from_bool(as[ 2]),
-                       from_bool(as[ 1]),
-                       from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = (*this)[0];
-      for (int n=1; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = (*this)[0];;
-      for (int n=1; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi16(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi16(as[15],
-                       as[14],
-                       as[13],
-                       as[12],
-                       as[11],
-                       as[10],
-                       as[ 9],
-                       as[ 8],
-                       as[ 7],
-                       as[ 6],
-                       as[ 5],
-                       as[ 4],
-                       as[ 3],
-                       as[ 2],
-                       as[ 1],
-                       as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
-                              7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare to zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
+template <> struct boolvec<fp16, 16>;
+template <> struct intvec<fp16, 16>;
+template <> struct realvec<fp16, 16>;
+
+template <> struct boolvec<fp16, 16> : floatprops<fp16> { // 16-lane boolean mask held in one __m256i
+  static int const size = 16;
+  typedef bool scalar_t;
+  typedef __m256i bvector_t;
+  static int const alignment = sizeof(bvector_t); // in bytes
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // A lane is true when its sign bit is set and false when it is unset
+  static uint_t from_bool(bool a) { return -uint_t(a); } // true -> all bits set, false -> 0
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } // tests the sign bit only
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v; // raw storage for all lanes
+
+  boolvec() {} // v is left uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_set1_epi16(from_bool(a))) {} // broadcast a to every lane
+  boolvec(bool const *as) // reads as[0..15]; set_epi16 lists lanes high-to-low, so as[0] is lane 0
+      : v(_mm256_set_epi16(from_bool(as[15]), from_bool(as[14]),
+                           from_bool(as[13]), from_bool(as[12]),
+                           from_bool(as[11]), from_bool(as[10]),
+                           from_bool(as[9]), from_bool(as[8]), from_bool(as[7]),
+                           from_bool(as[6]), from_bool(as[5]), from_bool(as[4]),
+                           from_bool(as[3]), from_bool(as[2]), from_bool(as[1]),
+                           from_bool(as[0]))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { // lane n decoded through to_bool
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) { // writes lane n, then yields *this via the comma operator
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return *this != boolvec(true); } // XOR with the all-true mask
+
+  boolvec operator&&(boolvec x) const { // lane-wise AND via the float-domain bitwise intrinsic
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator||(boolvec x) const { // lane-wise OR via the float-domain bitwise intrinsic
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const { // lane-wise XOR via the float-domain bitwise intrinsic
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+
+  bool all() const { // true only if every lane is true (scalar loop over the lanes)
+    bool r = (*this)[0];
+    for (int n = 1; n < size; ++n)
+      r = r && (*this)[n];
+    return r;
+  }
+  bool any() const { // true if at least one lane is true (scalar loop over the lanes)
+    bool r = (*this)[0];
+    ; // NOTE(review): stray empty statement, has no effect
+    for (int n = 1; n < size; ++n)
+      r = r || (*this)[n];
+    return r;
+  }
+
+  // ifthen: *this is the condition, x the then-value, y the else-value
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp16, 16> : floatprops<fp16> {
+  static int const size = 16;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {} // v is left uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {} // wraps a raw __m256i unchanged
+  intvec(int_t a) : v(_mm256_set1_epi16(a)) {} // broadcasts a to every 16-bit lane
+  intvec(int_t const *as) // reads as[0..15]; set_epi16 lists lanes high-to-low, so as[0] is lane 0
+      : v(_mm256_set_epi16(as[15], as[14], as[13], as[12], as[11], as[10],
+                           as[9], as[8], as[7], as[6], as[5], as[4], as[3],
+                           as[2], as[1], as[0])) {}
+  static intvec iota() { // lane i holds the value i (arguments are listed high lane first)
+    return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+                            0);
+  }
+
+  operator ivector_t() const { return v; } // implicit unwrap to the raw vector
+  int_t operator[](int n) const { // read lane n through the shared get_elt helper
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) { // writes lane n, then yields *this via the comma operator
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return v; } // same bits viewed as a boolvec; no comparison is done
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // No compare-with-zero intrinsic is used. Instead the sign bits of x and of
+    // x + (signbit_mask - 1) are OR-ed (assumes signbit_mask is the top lane bit).
+    intvec x = *this;
+    // We know that boolvec values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; } // identity
+  intvec operator-() const { return IV(I(0)) - *this; } // negation computed as 0 - x
+
+  intvec operator+(intvec x) const { // lane-wise 16-bit addition
 #ifdef __AVX2__
-      return _mm256_add_epi16(v, x.v);
+    return _mm256_add_epi16(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi16(vlo, xvlo);
-      vhi = _mm_add_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // non-AVX2 path: operate on the two 128-bit halves
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi16(vlo, xvlo);
+    vhi = _mm_add_epi16(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the halves
 #endif
-    }
-    intvec operator-(intvec x) const
-    {
+  }
+  intvec operator-(intvec x) const { // lane-wise 16-bit subtraction
 #ifdef __AVX2__
-      return _mm256_sub_epi16(v, x.v);
+    return _mm256_sub_epi16(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi16(vlo, xvlo);
-      vhi = _mm_sub_epi16(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // non-AVX2 path: operate on the two 128-bit halves
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi16(vlo, xvlo);
+    vhi = _mm_sub_epi16(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the halves
 #endif
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
+  }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; } // built on operator+; returns *this
+  intvec &operator-=(intvec const &x) { return *this = *this - x; } // built on operator-; returns *this
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; } // bitwise NOT computed as XOR with all-ones
+
+  intvec operator&(intvec x) const { // bitwise AND
 #ifdef __AVX2__
-      return _mm256_and_si256(v, x.v);
+    return _mm256_and_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256( // non-AVX2 path: float-domain bitwise op on the same bits
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec operator|(intvec x) const
-    {
+  }
+  intvec operator|(intvec x) const { // bitwise OR
 #ifdef __AVX2__
-      return _mm256_or_si256(v, x.v);
+    return _mm256_or_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256( // non-AVX2 path: float-domain bitwise op on the same bits
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec operator^(intvec x) const
-    {
+  }
+  intvec operator^(intvec x) const { // bitwise XOR
 #ifdef __AVX2__
-      return _mm256_xor_si256(v, x.v);
+    return _mm256_xor_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256( // non-AVX2 path: float-domain bitwise op on the same bits
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; } // built on the binary AND; returns *this
+  intvec &operator|=(intvec const &x) { return *this = *this | x; } // built on the binary OR; returns *this
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; } // built on the binary XOR; returns *this
+
+  intvec lsr(int_t n) const { // logical right shift of every lane by the same count n
 #ifdef __AVX2__
-      return _mm256_srli_epi16(v, n);
+    return _mm256_srli_epi16(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srli_epi16(vlo, n);
-      vhi = _mm_srli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // non-AVX2 path: operate on the two 128-bit halves
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srli_epi16(vlo, n);
+    vhi = _mm_srli_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the halves
 #endif
-    }
-    intvec operator>>(int_t n) const
-    {
+  }
+  intvec operator>>(int_t n) const { // arithmetic right shift of every lane by the same count n
 #ifdef __AVX2__
-      return _mm256_srai_epi16(v, n);
+    return _mm256_srai_epi16(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_srai_epi16(vlo, n);
-      vhi = _mm_srai_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // non-AVX2 path: operate on the two 128-bit halves
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_srai_epi16(vlo, n);
+    vhi = _mm_srai_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the halves
 #endif
-    }
-    intvec operator<<(int_t n) const
-    {
+  }
+  intvec operator<<(int_t n) const { // left shift of every lane by the same count n
 #ifdef __AVX2__
-      return _mm256_slli_epi16(v, n);
+    return _mm256_slli_epi16(v, n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      vlo = _mm_slli_epi16(vlo, n);
-      vhi = _mm_slli_epi16(vhi, n);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // non-AVX2 path: operate on the two 128-bit halves
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    vlo = _mm_slli_epi16(vlo, n);
+    vhi = _mm_slli_epi16(vhi, n);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the halves
 #endif
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; } // built on the scalar-count right shift; returns *this
+  intvec &operator<<=(int_t n) { return *this = *this << n; } // built on the scalar-count left shift; returns *this
+
+  intvec lsr(intvec n) const { // logical (zero-filling) right shift, lane i shifted by n[i]
 #ifdef __AVX2__
-      // TODO: Use permute instead of shift/mask?
-      _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
-      _mm256i vlo = _mm256_and_si256(mlo, v);
-      _mm256i vhi = v;
-      _mm256i clo = _mm256_and_si256(mlo, n);
-      _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
-      _mm256i rlo = _mm256_srlv_epi32(vlo, clo);
-      _mm256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi));
-      return _mm256_or_si256(rhi, rlo);
+    // TODO: Use permute instead of shift/mask? (locals use __m256i; `_mm256i` is not a type)
+    __m256i mlo = _mm256_set1_epi32(U(0x0000ffff)); // selects the low 16-bit lane of each 32-bit pair
+    __m256i vlo = _mm256_and_si256(mlo, v); // low lanes, zero-extended to 32 bits
+    __m256i vhi = v; // high lanes stay in place; low bits are masked off after the shift
+    __m256i clo = _mm256_and_si256(mlo, n); // shift counts for the low lanes
+    __m256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16)); // shift counts for the high lanes
+    __m256i rlo = _mm256_srlv_epi32(vlo, clo);
+    __m256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi)); // drop bits shifted into the low lane
+    return _mm256_or_si256(rhi, rlo);
 #else
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-#endif
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i])); // unsigned shift gives the zero fill
     }
-    intvec operator>>(intvec n) const
-    {
+    return r;
+#endif
+  }
+  intvec operator>>(intvec n) const { // arithmetic right shift, lane i shifted by n[i]
 #ifdef __AVX2__
-      intvec_t offset = U(1) << (bits-1);
-      return (*this + offset).lsr(n) - offset.lsr(n);
+    intvec_t offset = U(1) << (bits - 1); // bias of 2^(bits-1) added before the logical shift
+    return (*this + offset).lsr(n) - offset.lsr(n); // subtract the shifted bias again
 #else
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-#endif
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]); // scalar fallback, one lane at a time
     }
-    intvec operator<<(intvec n) const
-    {
+    return r;
+#endif
+  }
+  intvec operator<<(intvec n) const {
 #ifdef __AVX2__
-      // TODO: Use permute instead of shift/mask?
-      _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
-      _mm256i vlo = v;
+    // TODO: Use permute instead of shift/mask?
+    _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
+    _mm256i vlo = v;
       _mm256i vhi = _mm256_andnot_si256(mlo, v;
       _mm256i clo = _mm256_and_si256(mlo, n);
       _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
@@ -391,338 +322,274 @@ namespace vecmathlib {
       _mm256i rhi = _mm256_sllv_epi32(vhi, chi);
       return _mm256_or_si256(rhi, rlo);
 #else
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-#endif
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
+    return r;
+#endif
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; } // built on the vector-count right shift; returns *this
+  intvec &operator<<=(intvec n) { return *this = *this << n; } // built on the vector-count left shift; returns *this
+
+  boolvec_t operator==(intvec const &x) const { // lane-wise equality
 #ifdef __AVX2__
-      return _mm256_cmpeq_epi16(v, x.v);
+    return _mm256_cmpeq_epi16(v, x.v);
 #else
-      return ! (*this != x);
+    return !(*this != x); // non-AVX2 path: derived from operator!=
 #endif
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
+  }
+  boolvec_t operator!=(intvec const &x) const { // lane-wise inequality
 #ifdef __AVX2__
-      return ! (*this == x);
+    return !(*this == x); // AVX2 path: derived from operator==
 #else
-      return (*this ^ x).convert_bool();
+    return (*this ^ x).convert_bool(); // XOR is non-zero exactly where the lanes differ
 #endif
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
+  }
+  boolvec_t operator<(intvec const &x) const { // lane-wise signed less-than
 #ifdef __AVX2__
-      return _mm256_cmpgt_epi16(x.v, v);
+    return _mm256_cmpgt_epi16(x.v, v); // a < b evaluated as b > a
 #else
-      // TODO: First compare sign; then if equal, compare sign of difference
-      // TODO: Also look for intrinsics
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-#endif
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
+    // TODO: First compare sign; then if equal, compare sign of difference
+    // TODO: Also look for intrinsics
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]); // scalar fallback, one lane at a time
     }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp16,16>: floatprops<fp16>
-  {
-    static int const size = 16;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() {
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } // derived: not greater-than
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; } // derived: less-than with operands swapped
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } // derived: not less-than
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); } // same bits returned as a boolvec via as_bool()
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<fp16, 16> : floatprops<fp16> {
+  static int const size = 16;
+  typedef real_t scalar_t;
+  typedef __m256i vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() {
 #ifdef __AVX2__
-      return "<AVX2:16*fp16>";
+    return "<AVX2:16*fp16>";
 #else
-      return "<AVX:16*fp16>";
+    return "<AVX:16*fp16>";
 #endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_epi16(FP::as_int(a))) {}
+  realvec(real_t const *as)
+      : v(_mm256_set_epi16(
+            FP::as_int(as[15]), FP::as_int(as[14]), FP::as_int(as[13]),
+            FP::as_int(as[12]), FP::as_int(as[11]), FP::as_int(as[10]),
+            FP::as_int(as[9]), FP::as_int(as[8]), FP::as_int(as[7]),
+            FP::as_int(as[6]), FP::as_int(as[5]), FP::as_int(as[4]),
+            FP::as_int(as[3]), FP::as_int(as[2]), FP::as_int(as[1]),
+            FP::as_int(as[0]))) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    return _mm256_loadu_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi16(FP::as_int(as[15]),
-                       FP::as_int(as[14]),
-                       FP::as_int(as[13]),
-                       FP::as_int(as[12]),
-                       FP::as_int(as[11]),
-                       FP::as_int(as[10]),
-                       FP::as_int(as[ 9]),
-                       FP::as_int(as[ 8]),
-                       FP::as_int(as[ 7]),
-                       FP::as_int(as[ 6]),
-                       FP::as_int(as[ 5]),
-                       FP::as_int(as[ 4]),
-                       FP::as_int(as[ 3]),
-                       FP::as_int(as[ 2]),
-                       FP::as_int(as[ 1]),
-                       FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { __builtin_unreachable(); }
-    real_t minval() const { __builtin_unreachable(); }
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<fp16,16> boolvec<fp16,16>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<fp16,16> boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return (( -convert_int() & x) | (~-convert_int() & y));
-  }
-  
-  inline
-  realvec<fp16,16> boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<fp16,16> intvec<fp16,16>::abs() const
-  {
-#ifdef __AVX2__
-    return _mm256_abs_epi16(v);
-#else
-    return MF::vml_abs(*this);
-#endif
   }
-  
-  inline realvec<fp16,16> intvec<fp16,16>::as_float() const
-  {
-    return v;
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_si256((__m256i *)p, v);
   }
-  
-  inline realvec<fp16,16> intvec<fp16,16>::convert_float() const
-  {
-    __builtin_unreachable();
+  void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
+    }
   }
-  
-  inline intvec<fp16,16> intvec<fp16,16>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
+    }
   }
-  
-  inline intvec<fp16,16> intvec<fp16,16>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
   }
-  
+
+  intvec_t as_int() const { return v; }
+  intvec_t convert_int() const { __builtin_unreachable(); }
+
+  realvec operator+() const { __builtin_unreachable(); }
+  realvec operator-() const { __builtin_unreachable(); }
+
+  realvec operator+(realvec x) const { __builtin_unreachable(); }
+  realvec operator-(realvec x) const { __builtin_unreachable(); }
+  realvec operator*(realvec x) const { __builtin_unreachable(); }
+  realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { __builtin_unreachable(); }
+  real_t minval() const { __builtin_unreachable(); }
+  real_t prod() const { __builtin_unreachable(); }
+  real_t sum() const { __builtin_unreachable(); }
+
+  boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  boolvec_t signbit() const { return v; }
+};
+
+// boolvec definitions
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::as_int() const { return v; }
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::convert_int() const {
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<fp16, 16> boolvec<fp16, 16>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ((-convert_int() & x) | (~ - convert_int() & y));
+}
+
+inline realvec<fp16, 16> boolvec<fp16, 16>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_float();
+}
+
+// intvec definitions
+
+inline intvec<fp16, 16> intvec<fp16, 16>::abs() const {
+#ifdef __AVX2__
+  return _mm256_abs_epi16(v);
+#else
+  return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<fp16, 16> intvec<fp16, 16>::as_float() const { return v; }
+
+inline realvec<fp16, 16> intvec<fp16, 16>::convert_float() const {
+  __builtin_unreachable();
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FP16_16_H
+#endif // #ifndef VEC_AVX_FP16_16_H
diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h
index 912bd19..0ae79e7 100644
--- a/vec_avx_fp8_32.h
+++ b/vec_avx_fp8_32.h
@@ -12,763 +12,592 @@
 // AVX intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FP8_32
-  template<> struct boolvec<fp8,32>;
-  template<> struct intvec<fp8,32>;
-  template<> struct realvec<fp8,32>;
-  
-  
-  
-  template<>
-  struct boolvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef bool scalar_t;
-    typedef __m256i bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
-    boolvec(bool const* as):
-    v(_mm256_set_epi8(from_bool(as[31]),
-                      from_bool(as[30]),
-                      from_bool(as[29]),
-                      from_bool(as[28]),
-                      from_bool(as[27]),
-                      from_bool(as[26]),
-                      from_bool(as[25]),
-                      from_bool(as[24]),
-                      from_bool(as[23]),
-                      from_bool(as[22]),
-                      from_bool(as[21]),
-                      from_bool(as[20]),
-                      from_bool(as[19]),
-                      from_bool(as[18]),
-                      from_bool(as[17]),
-                      from_bool(as[16]),
-                      from_bool(as[15]),
-                      from_bool(as[14]),
-                      from_bool(as[13]),
-                      from_bool(as[12]),
-                      from_bool(as[11]),
-                      from_bool(as[10]),
-                      from_bool(as[ 9]),
-                      from_bool(as[ 8]),
-                      from_bool(as[ 7]),
-                      from_bool(as[ 6]),
-                      from_bool(as[ 5]),
-                      from_bool(as[ 4]),
-                      from_bool(as[ 3]),
-                      from_bool(as[ 2]),
-                      from_bool(as[ 1]),
-                      from_bool(as[ 0]))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return *this != boolvec(true); }
-    
-    boolvec operator&&(boolvec x) const 
-    {
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator||(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
-    }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const
-    {
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
-    }
-    
-    bool all() const
-    {
-      bool r = (*this)[0];
-      for (int n=1; n<size; ++n) r = r && (*this)[n];
-      return r;
-    }
-    bool any() const
-    {
-      bool r = (*this)[0];;
-      for (int n=1; n<size; ++n) r = r || (*this)[n];
-      return r;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef int_t scalar_t;
-    typedef __m256i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm256_set1_epi8(a)) {}
-    intvec(int_t const* as):
-    v(_mm256_set_epi8(as[31],
-                      as[30],
-                      as[29],
-                      as[28],
-                      as[27],
-                      as[26],
-                      as[25],
-                      as[24],
-                      as[23],
-                      as[22],
-                      as[21],
-                      as[20],
-                      as[19],
-                      as[18],
-                      as[17],
-                      as[16],
-                      as[15],
-                      as[14],
-                      as[13],
-                      as[12],
-                      as[11],
-                      as[10],
-                      as[ 9],
-                      as[ 8],
-                      as[ 7],
-                      as[ 6],
-                      as[ 5],
-                      as[ 4],
-                      as[ 3],
-                      as[ 2],
-                      as[ 1],
-                      as[ 0])) {}
-    static intvec iota()
-    {
-      return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
-                             23, 22, 21, 20, 19, 18, 17, 16,
-                             15, 14, 13, 12, 11, 10, 9, 8,
-                             7, 6, 5, 4, 3, 2, 1, 0);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare to zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec x = *this;
-      // We know that boolvec values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    
-    intvec operator+(intvec x) const
-    {
+template <> struct boolvec<fp8, 32>;
+template <> struct intvec<fp8, 32>;
+template <> struct realvec<fp8, 32>;
+
+template <> struct boolvec<fp8, 32> : floatprops<fp8> {
+  static int const size = 32;
+  typedef bool scalar_t;
+  typedef __m256i bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_set1_epi8(from_bool(a))) {}
+  boolvec(bool const *as)
+      : v(_mm256_set_epi8(
+            from_bool(as[31]), from_bool(as[30]), from_bool(as[29]),
+            from_bool(as[28]), from_bool(as[27]), from_bool(as[26]),
+            from_bool(as[25]), from_bool(as[24]), from_bool(as[23]),
+            from_bool(as[22]), from_bool(as[21]), from_bool(as[20]),
+            from_bool(as[19]), from_bool(as[18]), from_bool(as[17]),
+            from_bool(as[16]), from_bool(as[15]), from_bool(as[14]),
+            from_bool(as[13]), from_bool(as[12]), from_bool(as[11]),
+            from_bool(as[10]), from_bool(as[9]), from_bool(as[8]),
+            from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+            from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+            from_bool(as[1]), from_bool(as[0]))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return *this != boolvec(true); }
+
+  boolvec operator&&(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator||(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const {
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+  }
+
+  bool all() const {
+    bool r = (*this)[0];
+    for (int n = 1; n < size; ++n)
+      r = r && (*this)[n];
+    return r;
+  }
+  bool any() const {
+    bool r = (*this)[0];
+    ;
+    for (int n = 1; n < size; ++n)
+      r = r || (*this)[n];
+    return r;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp8, 32> : floatprops<fp8> {
+  static int const size = 32;
+  typedef int_t scalar_t;
+  typedef __m256i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm256_set1_epi8(a)) {}
+  intvec(int_t const *as)
+      : v(_mm256_set_epi8(as[31], as[30], as[29], as[28], as[27], as[26],
+                          as[25], as[24], as[23], as[22], as[21], as[20],
+                          as[19], as[18], as[17], as[16], as[15], as[14],
+                          as[13], as[12], as[11], as[10], as[9], as[8], as[7],
+                          as[6], as[5], as[4], as[3], as[2], as[1], as[0])) {}
+  static intvec iota() {
+    return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
+                           18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
+                           3, 2, 1, 0);
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return v; }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // There is no intrinsic to compare to zero. Instead, we check
+    // whether x is positive and x-1 is negative.
+    intvec x = *this;
+    // We know that boolvec values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return IV(I(0)) - *this; }
+
+  intvec operator+(intvec x) const {
 #ifdef __AVX2__
-      return _mm256_add_epi8(v, x.v);
+    return _mm256_add_epi8(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_add_epi8(vlo, xvlo);
-      vhi = _mm_add_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_add_epi8(vlo, xvlo);
+    vhi = _mm_add_epi8(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec operator-(intvec x) const
-    {
+  }
+  intvec operator-(intvec x) const {
 #ifdef __AVX2__
-      return _mm256_sub_epi8(v, x.v);
+    return _mm256_sub_epi8(v, x.v);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      __m128i xvlo = _mm256_castsi256_si128(x.v);
-      __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
-      vlo = _mm_sub_epi8(vlo, xvlo);
-      vhi = _mm_sub_epi8(vhi, xvhi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    __m128i xvlo = _mm256_castsi256_si128(x.v);
+    __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+    vlo = _mm_sub_epi8(vlo, xvlo);
+    vhi = _mm_sub_epi8(vhi, xvhi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec operator&(intvec x) const
-    {
+  }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec operator&(intvec x) const {
 #ifdef __AVX2__
-      return _mm256_and_si256(v, x.v);
+    return _mm256_and_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec operator|(intvec x) const
-    {
+  }
+  intvec operator|(intvec x) const {
 #ifdef __AVX2__
-      return _mm256_or_si256(v, x.v);
+    return _mm256_or_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
-                                              _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    intvec operator^(intvec x) const
-    {
+  }
+  intvec operator^(intvec x) const {
 #ifdef __AVX2__
-      return _mm256_xor_si256(v, x.v);
+    return _mm256_xor_si256(v, x.v);
 #else
-      return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
-                                               _mm256_castsi256_ps(x.v)));
+    return _mm256_castps_si256(
+        _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
 #endif
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec lsr(int_t n) const {
 #ifdef __AVX2__
-      uint_t masklo = U(0x00ffU) >> U(n);
-      uint_t maskhi = U(0xff00U);
-      intvec mask = masklo | maskhi;
-      return intvec(_mm256_srai_epi16(v, n)) & mask;
+    uint_t masklo = U(0x00ffU) >> U(n);
+    uint_t maskhi = U(0xff00U);
+    intvec mask = masklo | maskhi;
+    return intvec(_mm256_srai_epi16(v, n)) & mask;
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU) >> U(n);
-      uint_t maskhi = U(0xff00U);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    uint_t masklo = U(0x00ffU) >> U(n);
+    uint_t maskhi = U(0xff00U);
+    __m128i mask = _mm_set1_epi16(masklo | maskhi);
+    vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+    vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec operator>>(int_t n) const
-    {
+  }
+  intvec operator>>(int_t n) const {
 #ifdef __AVX2__
-      // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
-      // shifting, and subtract the shifted 0x80 after shifting
-      intvec_t offset = U(1) << (bits-1);
-      return (*this + offset).lsr(n) - offset.lsr(n);
+    // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
+    // shifting, and subtract the shifted 0x80 after shifting
+    intvec_t offset = U(1) << (bits - 1);
+    return (*this + offset).lsr(n) - offset.lsr(n);
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U);
-      __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
-                                    _mm_set1_epi16(maskhi));
-      vlo = _mm_or_si128(vlolo, vlohi);
-      __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
-                                    _mm_set1_epi16(masklo));
-      __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
-                                    _mm_set1_epi16(maskhi));
-      vhi = _mm_or_si128(vhilo, vhihi);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v);
+    __m128i vhi = _mm256_extractf128_si256(v, 1);
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U);
+    __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n + 8),
+                                  _mm_set1_epi16(masklo));
+    __m128i vlohi =
+        _mm_and_si128(_mm_srai_epi16(vlo, n), _mm_set1_epi16(maskhi));
+    vlo = _mm_or_si128(vlolo, vlohi);
+    __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n + 8),
+                                  _mm_set1_epi16(masklo));
+    __m128i vhihi =
+        _mm_and_si128(_mm_srai_epi16(vhi, n), _mm_set1_epi16(maskhi));
+    vhi = _mm_or_si128(vhilo, vhihi);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
 #endif
-    }
-    intvec operator<<(int_t n) const
-    {
+  }
+  intvec operator<<(int_t n) const { // shift every element left by the scalar count n
 #ifdef __AVX2__
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U) << U(n);
-      intvec mask = masklo | maskhi;
-      return intvec(_mm256_slli_epi16(v, n)) & mask;
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U) << U(n); // NOTE(review): if uint_t is 8 bits wide, U(0xff00U) is already 0 - confirm the mask
+    intvec mask = masklo | maskhi;
+    return intvec(_mm256_slli_epi16(v, n)) & mask; // shift as 16-bit lanes, then AND with mask
 #else
-      __m128i vlo = _mm256_castsi256_si128(v);
-      __m128i vhi = _mm256_extractf128_si256(v, 1);
-      uint_t masklo = U(0x00ffU);
-      uint_t maskhi = U(0xff00U) << U(n);
-      __m128i mask = _mm_set1_epi16(masklo | maskhi);
-      vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
-      vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
-      return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+    __m128i vlo = _mm256_castsi256_si128(v); // low 128-bit half of v
+    __m128i vhi = _mm256_extractf128_si256(v, 1); // high 128-bit half of v
+    uint_t masklo = U(0x00ffU);
+    uint_t maskhi = U(0xff00U) << U(n); // NOTE(review): same uint_t width question as the AVX2 branch
+    __m128i mask = _mm_set1_epi16(masklo | maskhi);
+    vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+    vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // rejoin the two halves
 #endif
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; } // in-place form of operator>>(int_t)
+  intvec &operator<<=(int_t n) { return *this = *this << n; } // in-place form of operator<<(int_t)
+
+  intvec lsr(intvec n) const { // right shift on unsigned values, with a per-element count
+    intvec r;
+    for (int i = 0; i < size; ++i) { // scalar loop, one element at a time
+      r.set_elt(i, U((*this)[i]) >> U(n[i])); // value and count are both cast to U first
     }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
+    return r;
+  }
+  intvec operator>>(intvec n) const { // right shift with a per-element count
+    intvec r;
+    for (int i = 0; i < size; ++i) { // scalar loop, one element at a time
+      r.set_elt(i, (*this)[i] >> n[i]); // no cast to U here, unlike lsr()
     }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
+    return r;
+  }
+  intvec operator<<(intvec n) const { // left shift with a per-element count
+    intvec r;
+    for (int i = 0; i < size; ++i) { // scalar loop, one element at a time
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; } // in-place form of operator>>(intvec)
+  intvec &operator<<=(intvec n) { return *this = *this << n; } // in-place form of operator<<(intvec)
+
+  boolvec_t operator==(intvec const &x) const { // element-wise equality test
 #ifdef __AVX2__
-      return _mm256_cmpeq_epi8(v, x.v);
+    return _mm256_cmpeq_epi8(v, x.v); // direct byte-wise compare
 #else
-      return ! (*this != x);
+    return !(*this != x); // without AVX2, defined as the negation of operator!=
 #endif
-    }
-    boolvec_t operator!=(intvec const& x) const
-    {
+  }
+  boolvec_t operator!=(intvec const &x) const { // element-wise inequality test
 #ifdef __AVX2__
-      return ! (*this == x);
+    return !(*this == x); // with AVX2, defined as the negation of operator==
 #else
-      return (*this ^ x).convert_bool();
+    return (*this ^ x).convert_bool(); // XOR of the operands, converted to a boolvec
 #endif
-    }
-    boolvec_t operator<(intvec const& x) const
-    {
+  }
+  boolvec_t operator<(intvec const &x) const { // element-wise less-than test
 #ifdef __AVX2__
-      return _mm256_cmpgt_epi8(x.v, v);
+    return _mm256_cmpgt_epi8(x.v, v); // operands swapped: x > *this is the same as *this < x
 #else
-      // TODO: First compare sign; then if equal, compare sign of difference
-      // TODO: Also look for intrinsics
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-#endif
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
+    // TODO: First compare sign; then if equal, compare sign of difference
+    // TODO: Also look for intrinsics
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) { // scalar fallback, one element at a time
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<fp8,32>: floatprops<fp8>
-  {
-    static int const size = 32;
-    typedef real_t scalar_t;
-    typedef __m256i vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() {
+    return r;
+#endif
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } // negation of operator>
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; } // operator< with the operands swapped
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } // negation of operator<
+
+  intvec_t abs() const; // declared only; defined out of line
+  boolvec_t isignbit() const { return as_bool(); } // simply forwards to as_bool()
+  intvec_t max(intvec_t x) const; // declared only; defined out of line
+  intvec_t min(intvec_t x) const; // declared only; defined out of line
+};
+
+template <> struct realvec<fp8, 32> : floatprops<fp8> { // 32 fp8 elements stored in one __m256i
+  static int const size = 32;
+  typedef real_t scalar_t;
+  typedef __m256i vector_t;
+  static int const alignment = sizeof(vector_t); // checked by the aligned load/store asserts below
+
+  static char const *name() { // label for the instruction set chosen at compile time
 #ifdef __AVX2__
-      return "<AVX2:32*fp8>";
+    return "<AVX2:32*fp8>";
 #else
-      return "<AVX:32*fp8>";
+    return "<AVX:32*fp8>";
 #endif
+  }
+  void barrier() { __asm__("" : "+x"(v)); } // empty asm that names v as a read-write operand
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v; // raw storage for all elements
+
+  realvec() {} // does not initialize v
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm256_set1_epi8(FP::as_int(a))) {} // same value in every element
+  realvec(real_t const *as) // reads as[0] .. as[31]; arguments are listed from index 31 down to 0
+      : v(_mm256_set_epi8(
+            FP::as_int(as[31]), FP::as_int(as[30]), FP::as_int(as[29]),
+            FP::as_int(as[28]), FP::as_int(as[27]), FP::as_int(as[26]),
+            FP::as_int(as[25]), FP::as_int(as[24]), FP::as_int(as[23]),
+            FP::as_int(as[22]), FP::as_int(as[21]), FP::as_int(as[20]),
+            FP::as_int(as[19]), FP::as_int(as[18]), FP::as_int(as[17]),
+            FP::as_int(as[16]), FP::as_int(as[15]), FP::as_int(as[14]),
+            FP::as_int(as[13]), FP::as_int(as[12]), FP::as_int(as[11]),
+            FP::as_int(as[10]), FP::as_int(as[9]), FP::as_int(as[8]),
+            FP::as_int(as[7]), FP::as_int(as[6]), FP::as_int(as[5]),
+            FP::as_int(as[4]), FP::as_int(as[3]), FP::as_int(as[2]),
+            FP::as_int(as[1]), FP::as_int(as[0]))) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { // read element n through the generic get_elt helper
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) { // write element n, then return *this
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) { // aligned load: p must be a multiple of alignment
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm256_load_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p) { // load without an alignment check
+    return _mm256_loadu_si256((__m256i const *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { // load from p + ioff; p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole number of vectors, so p + ioff is aligned as well
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const { // masked aligned load; unselected elements come from *this
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) { // NOTE(review): tests all(m.m) here, the sibling overloads test m.all_m - confirm
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
-    realvec(real_t const* as):
-    v(_mm256_set_epi8(FP::as_int(as[31]),
-                      FP::as_int(as[30]),
-                      FP::as_int(as[29]),
-                      FP::as_int(as[28]),
-                      FP::as_int(as[27]),
-                      FP::as_int(as[26]),
-                      FP::as_int(as[25]),
-                      FP::as_int(as[24]),
-                      FP::as_int(as[23]),
-                      FP::as_int(as[22]),
-                      FP::as_int(as[21]),
-                      FP::as_int(as[20]),
-                      FP::as_int(as[19]),
-                      FP::as_int(as[18]),
-                      FP::as_int(as[17]),
-                      FP::as_int(as[16]),
-                      FP::as_int(as[15]),
-                      FP::as_int(as[14]),
-                      FP::as_int(as[13]),
-                      FP::as_int(as[12]),
-                      FP::as_int(as[11]),
-                      FP::as_int(as[10]),
-                      FP::as_int(as[ 9]),
-                      FP::as_int(as[ 8]),
-                      FP::as_int(as[ 7]),
-                      FP::as_int(as[ 6]),
-                      FP::as_int(as[ 5]),
-                      FP::as_int(as[ 4]),
-                      FP::as_int(as[ 3]),
-                      FP::as_int(as[ 2]),
-                      FP::as_int(as[ 1]),
-                      FP::as_int(as[ 0]))) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm256_load_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm256_loadu_si256((__m256i const*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm256_store_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p) const
-    {
-      return _mm256_storeu_si256((__m256i*)p, v);
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // TODO: this is expensive
-        for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const { // masked load; unselected elements come from *this
+    if (__builtin_expect(m.all_m, true)) { // expected case: every element selected, do a plain load
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { __builtin_unreachable(); }
-    
-    
-    
-    realvec operator+() const { __builtin_unreachable(); }
-    realvec operator-() const { __builtin_unreachable(); }
-    
-    realvec operator+(realvec x) const { __builtin_unreachable(); }
-    realvec operator-(realvec x) const { __builtin_unreachable(); }
-    realvec operator*(realvec x) const { __builtin_unreachable(); }
-    realvec operator/(realvec x) const { __builtin_unreachable(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { __builtin_unreachable(); }
-    real_t minval() const { __builtin_unreachable(); }
-    real_t prod() const { __builtin_unreachable(); }
-    real_t sum() const { __builtin_unreachable(); }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
-    boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-    
-    
-    
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    boolvec_t signbit() const { return v; }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<fp8,32> boolvec<fp8,32>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return (( -convert_int() & x) | (~-convert_int() & y));
-  }
-  
-  inline
-  realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_float();
-  }
-
-  
-  
-  // intvec definitions
-  
-  inline intvec<fp8,32> intvec<fp8,32>::abs() const
-  {
-#ifdef __AVX2__
-    return _mm256_abs_epi8(v);
-#else
-    return MF::vml_abs(*this);
-#endif
   }
-  
-  inline realvec<fp8,32> intvec<fp8,32>::as_float() const
-  {
-    return v;
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked load from p + ioff; p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole number of vectors, so the aligned variant can be used
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const { // aligned store: p must be a multiple of alignment
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm256_store_si256((__m256i *)p, v);
   }
-  
-  inline realvec<fp8,32> intvec<fp8,32>::convert_float() const
-  {
-    __builtin_unreachable();
+  void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); } // store without an alignment check
+  void storeu(real_t *p, std::ptrdiff_t ioff) const { // store to p + ioff; p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole number of vectors, so p + ioff is aligned as well
+      return storea(p + ioff);
+    storeu(p + ioff);
   }
-  
-  inline intvec<fp8,32> intvec<fp8,32>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
+  void storea(real_t *p, mask_t const &m) const { // masked aligned store; only selected elements are written
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) { // expected case: every element selected, store the whole vector
+      storea(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
+    }
   }
-  
-  inline intvec<fp8,32> intvec<fp8,32>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
+  void storeu(real_t *p, mask_t const &m) const { // masked store; only selected elements are written
+    if (__builtin_expect(m.all_m, true)) { // expected case: every element selected, store the whole vector
+      storeu(p);
+    } else {
+      // TODO: this is expensive
+      for (int n = 0; n < size; ++n)
+        if (m.m[n])
+          p[n] = (*this)[n];
+    }
   }
-  
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked store to p + ioff; p itself must be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0) // whole number of vectors, so the aligned variant can be used
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return v; } // hands the raw vector to intvec_t unchanged
+  intvec_t convert_int() const { __builtin_unreachable(); } // no implementation: must never be called
+
+  realvec operator+() const { __builtin_unreachable(); } // the operators below have no implementation for fp8
+  realvec operator-() const { __builtin_unreachable(); }
+
+  realvec operator+(realvec x) const { __builtin_unreachable(); }
+  realvec operator-(realvec x) const { __builtin_unreachable(); }
+  realvec operator*(realvec x) const { __builtin_unreachable(); }
+  realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; } // compound forms forward to the binary operators above
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { __builtin_unreachable(); } // reductions have no implementation either
+  real_t minval() const { __builtin_unreachable(); }
+  real_t prod() const { __builtin_unreachable(); }
+  real_t sum() const { __builtin_unreachable(); }
+
+  boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); } // comparisons have no implementation either
+  boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+  boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } // the functions below forward to mathfuncs (MF)
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  boolvec_t signbit() const { return v; } // hands the raw vector to boolvec_t unchanged
+};
+
+// boolvec definitions
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::as_int() const { return v; } // hands the raw vector to intvec unchanged
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::convert_int() const { // boolvec to integer vector by value, not by bits
+  return lsr(as_int(), bits - 1); // presumably bits is the element width, leaving 0 or 1 per element - TODO confirm
+}
+
+inline boolvec<fp8, 32> boolvec<fp8, 32>::ifthen(boolvec_t x, // element-wise select between two boolvecs
+                                                 boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool(); // done on the integer views, then viewed as bool again
+}
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::ifthen(intvec_t x, intvec_t y) const { // element-wise select: x where the mask is set, y elsewhere
+  return ((-convert_int() & x) | (~ - convert_int() & y)); // -convert_int() is the bit mask; NOTE(review): convert_int() is evaluated twice
+}
+
+inline realvec<fp8, 32> boolvec<fp8, 32>::ifthen(realvec_t x, // element-wise select between two realvecs
+                                                 realvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_float(); // done on the integer views, then viewed as float again
+}
+
+// intvec definitions
+
+inline intvec<fp8, 32> intvec<fp8, 32>::abs() const { // element-wise absolute value
+#ifdef __AVX2__
+  return _mm256_abs_epi8(v); // single intrinsic when AVX2 is available
+#else
+  return MF::vml_abs(*this); // otherwise forwards to the mathfuncs implementation
+#endif
+}
+
+inline realvec<fp8, 32> intvec<fp8, 32>::as_float() const { return v; } // hands the raw vector to realvec unchanged
+
+inline realvec<fp8, 32> intvec<fp8, 32>::convert_float() const { // no implementation: must never be called
+  __builtin_unreachable();
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::max(intvec_t x) const { // forwards to the mathfuncs implementation
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::min(intvec_t x) const { // forwards to the mathfuncs implementation
+  return MF::vml_min(*this, x);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_AVX_FP8_32_H
+#endif // #ifndef VEC_AVX_FP8_32_H
diff --git a/vec_base.h b/vec_base.h
index 737a1e0..81c698d 100644
--- a/vec_base.h
+++ b/vec_base.h
@@ -4,663 +4,544 @@
 #define VEC_BASE_H
 
 #ifndef VML_NO_IOSTREAM
-#  include <iostream>
+#include <iostream>
 #endif
 
 #include "vec_mask.h"
 
+namespace vecmathlib {
 
+template <typename real_t, int size> struct boolvec {}; // empty primary template; specializations such as boolvec<fp8, 32> supply the members
 
-namespace vecmathlib {
-  
-  template<typename real_t, int size>
-  struct boolvec {
-  };
-  
-  template<typename real_t, int size>
-  struct intvec {
-  };
-  
-  template<typename real_t, int size>
-  struct realvec {
-  };
-  
-
-  
-  // boolvec wrappers
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> as_int(boolvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> convert_int(boolvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                               boolvec<real_t, size> x,
-                               boolvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                              intvec<real_t, size> x,
-                              intvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ifthen(boolvec<real_t, size> c,
-                               realvec<real_t, size> x,
-                               realvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intvec wrappers
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> as_bool(intvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> as_float(intvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> convert_float(intvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> abs(intvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
-                                        intvec<real_t, size> y,
-                                        intvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> clz(intvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isignbit(intvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
-                                  typename intvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
-                                  intvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> max(intvec<real_t, size> x,
-                                  intvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> min(intvec<real_t, size> x,
-                                  intvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> popcount(intvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> rotate(intvec<real_t, size> x,
-                                     typename intvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> rotate(intvec<real_t, size> x,
-                                     intvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realvec wrappers
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loada(real_t const* p,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loadu(real_t const* p,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realvec<real_t, size> x,
-        typename realvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realvec<real_t, size> x, real_t* p)
-  {
-    x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p)
-  {
-    x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realvec<real_t, size> x, real_t* p,
-                     typename realvec<real_t, size>::mask_t const& m)
-  {
-    x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p,
-                     typename realvec<real_t, size>::mask_t const& m)
-  {
-    x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realvec<real_t, size>::mask_t const &m)
-  {
-    x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> as_int(realvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> convert_int(realvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> acos(realvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> acosh(realvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> asin(realvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> asinh(realvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atan(realvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atan2(realvec<real_t, size> x,
-                                     realvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> atanh(realvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cbrt(realvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> ceil(realvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> copysign(realvec<real_t, size> x,
-                                        realvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cos(realvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> cosh(realvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp(realvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp10(realvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> exp2(realvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> expm1(realvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fabs(realvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> floor(realvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fdim(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fma(realvec<real_t, size> x,
-                                   realvec<real_t, size> y,
-                                   realvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmax(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmin(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> fmod(realvec<real_t, size> x,
-                                    realvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> frexp(realvec<real_t, size> x,
-                                     intvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> hypot(realvec<real_t, size> x,
-                                     realvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intvec<real_t, size> ilogb(realvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isfinite(realvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isinf(realvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isnan(realvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> isnormal(realvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ldexp(realvec<real_t, size> x,
-                              typename intvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realvec<real_t, size> ldexp(realvec<real_t, size> x,
-                               intvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log(realvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log10(realvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log1p(realvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> log2(realvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-    
-  template<typename real_t, int size>
-  inline intvec<real_t, size> lrint(realvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
+template <typename real_t, int size> struct intvec {};
+
+template <typename real_t, int size> struct realvec {};
+
+// boolvec wrappers
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(boolvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(boolvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size> inline bool all(boolvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size> inline bool any(boolvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                    boolvec<real_t, size> x,
+                                    boolvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                   intvec<real_t, size> x,
+                                   intvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                    realvec<real_t, size> x,
+                                    realvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intvec wrappers
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> as_bool(intvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> as_float(intvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> convert_float(intvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> abs(intvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
+                                      intvec<real_t, size> y,
+                                      intvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> clz(intvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isignbit(intvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                typename intvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                intvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> max(intvec<real_t, size> x,
+                                intvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> min(intvec<real_t, size> x,
+                                intvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> popcount(intvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+                                   typename intvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+                                   intvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realvec wrappers
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loada(real_t const *p, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realvec<real_t, size> x,
+      typename realvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p) {
+  x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p) {
+  x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff) {
+  x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realvec<real_t, size>::mask_t const &m) {
+  x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(realvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(realvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acos(realvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acosh(realvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asin(realvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asinh(realvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan(realvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan2(realvec<real_t, size> x,
+                                   realvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atanh(realvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cbrt(realvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ceil(realvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> copysign(realvec<real_t, size> x,
+                                      realvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cos(realvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cosh(realvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp(realvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp10(realvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp2(realvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> expm1(realvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fabs(realvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> floor(realvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fdim(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+fma(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmax(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmin(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmod(realvec<real_t, size> x,
+                                  realvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> frexp(realvec<real_t, size> x,
+                                   intvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> hypot(realvec<real_t, size> x,
+                                   realvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ilogb(realvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isfinite(realvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isinf(realvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnan(realvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnormal(realvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+                                   typename intvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+                                   intvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log(realvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log10(realvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log1p(realvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log2(realvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lrint(realvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+mad(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
+                                       realvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> pow(realvec<real_t, size> x,
+                                 realvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rcp(realvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> remainder(realvec<real_t, size> x,
+                                       realvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rint(realvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> round(realvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rsqrt(realvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> signbit(realvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sin(realvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sinh(realvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sqrt(realvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tan(realvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tanh(realvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> trunc(realvec<real_t, size> x) {
+  return x.trunc();
+}
 
-  template<typename real_t, int size>
-  inline realvec<real_t, size> mad(realvec<real_t, size> x,
-                                   realvec<real_t, size> y,
-                                   realvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
-                                         realvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> pow(realvec<real_t, size> x,
-                                   realvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rcp(realvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> remainder(realvec<real_t, size> x,
-                                         realvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rint(realvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> round(realvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> rsqrt(realvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolvec<real_t, size> signbit(realvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sin(realvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sinh(realvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> sqrt(realvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> tan(realvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> tanh(realvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realvec<real_t, size> trunc(realvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
 #ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, boolvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, intvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_BASE_H
+#endif // #ifndef VEC_BASE_H
diff --git a/vec_builtin.h b/vec_builtin.h
index bbe4277..2f1ff90 100644
--- a/vec_builtin.h
+++ b/vec_builtin.h
@@ -12,1450 +12,1253 @@
 #include <cmath>
 #include <cstring>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 #include <string>
 
+namespace vecmathlib {
 
+template <typename T, int N> struct boolbuiltinvec;
+template <typename T, int N> struct intbuiltinvec;
+template <typename T, int N> struct realbuiltinvec;
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct boolbuiltinvec;
-  template<typename T, int N> struct intbuiltinvec;
-  template<typename T, int N> struct realbuiltinvec;
-  
-  
-  
-  template<typename T, int N>
-  struct boolbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef bool scalar_t;
-    typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true is -1, false is 0
-    static int_t from_bool(bool a) { return -uint_t(a); }
-    static bool to_bool(int_t a) { return a; }
-  public:
-    
-    typedef boolbuiltinvec boolvec_t;
-    typedef intbuiltinvec<real_t, size> intvec_t;
-    typedef realbuiltinvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
-    // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from bvector_t, since this would
-    // conflict with the constructor from bool
-    // boolbuiltinvec(bvector_t x): v(x) {}
-    static boolvec_t mkvec(bvector_t x) { boolvec_t res; res.v=x; return res; }
-    boolbuiltinvec(bool a): v(from_bool(a)) {}
-    boolbuiltinvec(const bool* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return to_bool(v[n]); }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=from_bool(a), *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intbuiltinvec
-    intvec_t convert_int() const; // defined after intbuiltinvec
-    
-    
-    
-    boolvec_t operator!() const { return mkvec(!v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
-    boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
-    boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
-    
-    bool all() const
-    {
-      bool res = (*this)[0];
-      for (int d=1; d<size; ++d) res = res && (*this)[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = (*this)[0];
-      for (int d=1; d<size; ++d) res = res || (*this)[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realbuiltinvec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct intbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
-    typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    static_assert(size * sizeof(real_t) == sizeof(uvector_t),
-                  "vector size is wrong");
-    
-    typedef boolbuiltinvec<real_t, size> boolvec_t;
-    typedef intbuiltinvec intvec_t;
-    typedef realbuiltinvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
-    // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from ivector_t, since this would
-    // conflict with the constructor from int_t
-    // intbuiltinvec(ivector_t x): v(x) {}
-    static intvec_t mkvec(ivector_t x) { intvec_t res; res.v=x; return res; }
-    intbuiltinvec(int_t a): v(a) {}
-    intbuiltinvec(const int_t* as) { std::memcpy(&v, as, sizeof v); }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, d);
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const
-    {
-      boolvec_t res;
-      std::memcpy(&res.v, &v, sizeof res.v);
-      return res;
-    }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realbuiltinvec
-    realvec_t convert_float() const; // defined after realbuiltinvec
-    
-    
-    
-    intvec_t operator+() const { return mkvec(+v); }
-    intvec_t operator-() const { return mkvec(-v); }
-    
-    intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); }
-    intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
-    intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
-    intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); }
-    intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); }
-    
-    intvec_t& operator+=(const intvec_t& x) { return *this=*this+x; }
-    intvec_t& operator-=(const intvec_t& x) { return *this=*this-x; }
-    intvec_t& operator*=(const intvec_t& x) { return *this=*this*x; }
-    intvec_t& operator/=(const intvec_t& x) { return *this=*this/x; }
-    intvec_t& operator%=(const intvec_t& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return mkvec(~v); }
-    
-    intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); }
-    intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
-    intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
-    
-    intvec_t& operator&=(const intvec_t& x) { return *this=*this&x; }
-    intvec_t& operator|=(const intvec_t& x) { return *this=*this|x; }
-    intvec_t& operator^=(const intvec_t& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return MF::vml_bitifthen(*this, x, y);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      return mkvec(ivector_t(uvector_t(v) >> U(n)));
-    }
-    intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t operator>>(int_t n) const { return mkvec(v >> n); }
-    intvec_t operator<<(int_t n) const { return mkvec(v << n); }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return mkvec(ivector_t(uvector_t(v)>>uvector_t(n.v)));
-    }
-    intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); }
-    intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t val = (*this)[d];
-        int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
-        res.set_elt(d, cnt);
-      }
-      return res;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_popcount(U((*this)[d])));
-      }
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v == x.v);
-    }
-    boolvec_t operator!=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v != x.v);
-    }
-    boolvec_t operator<(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v < x.v);
-    }
-    boolvec_t operator<=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v <= x.v);
-    }
-    boolvec_t operator>(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v > x.v);
-    }
-    boolvec_t operator>=(const intvec_t& x) const
-    {
-      return boolvec_t::mkvec(v >= x.v);
+template <typename T, int N> struct boolbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef bool scalar_t;
+  typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true is -1, false is 0
+  static int_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(int_t a) { return a; }
+
+public:
+  typedef boolbuiltinvec boolvec_t;
+  typedef intbuiltinvec<real_t, size> intvec_t;
+  typedef realbuiltinvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
+  // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from bvector_t, since this would
+  // conflict with the constructor from bool
+  // boolbuiltinvec(bvector_t x): v(x) {}
+  static boolvec_t mkvec(bvector_t x) {
+    boolvec_t res;
+    res.v = x;
+    return res;
+  }
+  boolbuiltinvec(bool a) : v(from_bool(a)) {}
+  boolbuiltinvec(const bool *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return to_bool(v[n]); }
+  boolvec_t &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; }
+
+  intvec_t as_int() const;      // defined after intbuiltinvec
+  intvec_t convert_int() const; // defined after intbuiltinvec
+
+  boolvec_t operator!() const { return mkvec(!v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
+  boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
+  boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
+
+  bool all() const {
+    bool res = (*this)[0];
+    for (int d = 1; d < size; ++d)
+      res = res && (*this)[d];
+    return res;
+  }
+  bool any() const { // true iff at least one lane is true
+    for (int lane = 0; lane < size; ++lane)
+      if ((*this)[lane])
+        return true;
+    return false;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
+  realvec_t ifthen(realvec_t x,
+                   realvec_t y) const; // defined after realbuiltinvec
+};
+
+template <typename T, int N> struct intbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
+  typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+  static_assert(size * sizeof(real_t) == sizeof(uvector_t),
+                "vector size is wrong");
+
+  typedef boolbuiltinvec<real_t, size> boolvec_t;
+  typedef intbuiltinvec intvec_t;
+  typedef realbuiltinvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
+  // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from ivector_t, since this would
+  // conflict with the constructor from int_t
+  // intbuiltinvec(ivector_t x): v(x) {}
+  static intvec_t mkvec(ivector_t x) {
+    intvec_t res;
+    res.v = x;
+    return res;
+  }
+  intbuiltinvec(int_t a) : v(a) {}
+  intbuiltinvec(const int_t *as) { std::memcpy(&v, as, sizeof v); }
+  static intvec_t iota() { // lane d holds the value d
+    intvec_t ramp;
+    for (int lane = 0; lane < size; ++lane)
+      ramp.v[lane] = int_t(lane);
+    return ramp;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const { // byte-wise copy of v, no value conversion
+    boolvec_t bits;
+    std::memcpy(&bits.v, &v, sizeof bits.v);
+    return bits;
+  }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realbuiltinvec
+  realvec_t convert_float() const; // defined after realbuiltinvec
+
+  intvec_t operator+() const { return mkvec(+v); }
+  intvec_t operator-() const { return mkvec(-v); }
+
+  intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); }
+  intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
+  intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
+  intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); }
+  intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); }
+
+  intvec_t &operator+=(const intvec_t &x) { return *this = *this + x; }
+  intvec_t &operator-=(const intvec_t &x) { return *this = *this - x; }
+  intvec_t &operator*=(const intvec_t &x) { return *this = *this * x; }
+  intvec_t &operator/=(const intvec_t &x) { return *this = *this / x; }
+  intvec_t &operator%=(const intvec_t &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return mkvec(~v); }
+
+  intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); }
+  intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
+  intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
+
+  intvec_t &operator&=(const intvec_t &x) { return *this = *this & x; }
+  intvec_t &operator|=(const intvec_t &x) { return *this = *this | x; }
+  intvec_t &operator^=(const intvec_t &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return MF::vml_bitifthen(*this, x, y);
+  }
+
+  intvec_t lsr(int_t n) const { return mkvec(ivector_t(uvector_t(v) >> U(n))); }
+  intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t operator>>(int_t n) const { return mkvec(v >> n); }
+  intvec_t operator<<(int_t n) const { return mkvec(v << n); }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    return mkvec(ivector_t(uvector_t(v) >> uvector_t(n.v)));
+  }
+  intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); }
+  intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t val = (*this)[d];
+      int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
+      res.set_elt(d, cnt);
     }
-    
-    intvec_t abs() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, builtin_abs((*this)[d]));
-      return res;
+    return res;
+  }
+  intvec_t popcount() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_popcount(U((*this)[d])));
     }
-    
-    boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
-    
-    intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
-    intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realbuiltinvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static const int size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
-    static const int alignment = sizeof(vector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
+    return res;
+  }
+
+  boolvec_t operator==(const intvec_t &x) const {
+    return boolvec_t::mkvec(v == x.v);
+  }
+  boolvec_t operator!=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v != x.v);
+  }
+  boolvec_t operator<(const intvec_t &x) const {
+    return boolvec_t::mkvec(v < x.v);
+  }
+  boolvec_t operator<=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v <= x.v);
+  }
+  boolvec_t operator>(const intvec_t &x) const {
+    return boolvec_t::mkvec(v > x.v);
+  }
+  boolvec_t operator>=(const intvec_t &x) const {
+    return boolvec_t::mkvec(v >= x.v);
+  }
+
+  intvec_t abs() const { // builtin_abs applied to each lane
+    intvec_t magnitudes;
+    for (int lane = 0; lane < size; ++lane)
+      magnitudes.set_elt(lane, builtin_abs(v[lane]));
+    return magnitudes;
+  }
+
+  boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
+
+  intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
+  intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
+};
+
+template <typename T, int N> struct realbuiltinvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static const int size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
+  static const int alignment = sizeof(vector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
 #ifndef VML_NO_IOSTREAM
-    static const char* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<builtin:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static const char *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<builtin:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier() { volatile vector_t x __attribute__((__unused__)) = v; }
-    
-    typedef boolbuiltinvec<real_t, size> boolvec_t;
-    typedef intbuiltinvec<real_t, size> intvec_t;
-    typedef realbuiltinvec realvec_t;
-    
-  private:
-    boolvec_t mapb(bool f(real_t)) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    intvec_t map(int_t f(real_t)) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t)) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t), intvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t*), intvec_t* x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t ix;
-        res.v[d] = f(v[d], &ix);
-        x->set_elt(d, ix);
-      }
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t), realvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t, real_t),
-                  realvec_t x, realvec_t y) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
-      return res;
+  void barrier() { volatile vector_t x __attribute__((__unused__)) = v; }
+
+  typedef boolbuiltinvec<real_t, size> boolvec_t;
+  typedef intbuiltinvec<real_t, size> intvec_t;
+  typedef realbuiltinvec realvec_t;
+
+private:
+  boolvec_t mapb(bool f(real_t)) const { // applies predicate f to each lane
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.set_elt(d, f(v[d])); // set_elt stores true as -1; a raw bool gave 1
+    return res;
+  }
+  intvec_t map(int_t f(real_t)) const { // lane-wise f with integer results
+    intvec_t out;
+    for (int lane = 0; lane < size; ++lane)
+      out.set_elt(lane, f(v[lane]));
+    return out;
+  }
+  realvec_t map(real_t f(real_t)) const { // lane-wise unary f
+    realvec_t out;
+    for (int lane = 0; lane < size; ++lane)
+      out.set_elt(lane, f(v[lane]));
+    return out;
+  }
+  realvec_t map(real_t f(real_t, int_t), intvec_t x) const { // f(v[d], x[d])
+    realvec_t out;
+    for (int lane = 0; lane < size; ++lane)
+      out.set_elt(lane, f(v[lane], x.v[lane]));
+    return out;
+  }
+  realvec_t map(real_t f(real_t, int_t *), intvec_t *x) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t ix;
+      res.v[d] = f(v[d], &ix);
+      x->set_elt(d, ix);
     }
-  public:
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realbuiltinvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
-    // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
-    // Can't have a constructor from vector_t, since this would
-    // conflict with the constructor from real_t
-    // realbuiltinvec(vector_t x): v(x) {}
-    static realvec_t mkvec(vector_t x) { realvec_t res; res.v=x; return res; }
-    realbuiltinvec(real_t a): v(a) {}
-    realbuiltinvec(const real_t* as) { std::memcpy(&v, as, sizeof v); }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t), realvec_t x) const { // f(v[d], x[d])
+    realvec_t out;
+    for (int lane = 0; lane < size; ++lane)
+      out.set_elt(lane, f(v[lane], x.v[lane]));
+    return out;
+  }
+  realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x,
+                realvec_t y) const { // lane-wise f(v[d], x[d], y[d])
+    realvec_t out;
+    for (int lane = 0; lane < size; ++lane)
+      out.set_elt(lane, f(v[lane], x.v[lane], y.v[lane]));
+    return out;
+  }
+
+public:
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realbuiltinvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
+  // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
+  // Can't have a constructor from vector_t, since this would
+  // conflict with the constructor from real_t
+  // realbuiltinvec(vector_t x): v(x) {}
+  static realvec_t mkvec(vector_t x) {
+    realvec_t res;
+    res.v = x;
+    return res;
+  }
+  realbuiltinvec(real_t a) : v(a) {}
+  realbuiltinvec(const real_t *as) { std::memcpy(&v, as, sizeof v); }
+
+  real_t operator[](int n) const { return v[n]; }
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
 #if __has_builtin(__builtin_assume_aligned)
-      p = (const real_t*)__builtin_assume_aligned(p, sizeof(realvec_t));
+    p = (const real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
 #endif
-      return mkvec(*(const vector_t*)p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      // return mkvec(*(const vector_t*)p);
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, p[d]);
-      return res;
-      // realvec_t res;
-      // memcpy(&res.v, p, sizeof res.v);
-      // return res;
-    }
-    static realvec_t loadu(const real_t* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, const mask_t& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(const real_t* p, const mask_t& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(const real_t* p, size_t ioff, const mask_t& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
+    return mkvec(*(const vector_t *)p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    // Unaligned load: copy `size` consecutive scalars from p lane by lane.
+    // Disabled alternatives kept for reference:
+    //   return mkvec(*(const vector_t*)p);
+    //   memcpy(&res.v, p, sizeof res.v); return res;
+    realvec_t loaded;
+    for (int lane = 0; lane < size; ++lane)
+      loaded.v[lane] = p[lane];
+    return loaded;
+  }
+  static realvec_t loadu(const real_t *p, size_t ioff) { // loads from p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // checks base p, not p + ioff
+    return loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, const mask_t &m) const {
+    return m.m.ifthen(loada(p), *this);
+  }
+  realvec_t loadu(const real_t *p, const mask_t &m) const {
+    return m.m.ifthen(loadu(p), *this);
+  }
+  realvec_t loadu(const real_t *p, size_t ioff, const mask_t &m) const {
+    return m.m.ifthen(loadu(p, ioff), *this);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
 #if __has_builtin(__builtin_assume_aligned)
-      p = (real_t*)__builtin_assume_aligned(p, sizeof(realvec_t));
+    p = (real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
 #endif
-      *(vector_t*)p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      // *(vector_t*)p = v;
-      for (int d=0; d<size; ++d) p[d] = (*this)[d];
-      // memcpy(p, &v, sizeof res.v);
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, const mask_t& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = (*this)[d];
-    }
-    void storeu(real_t* p, size_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      std::memcpy(&res.v, &v, sizeof res.v);
-      return res;
-    }
-    intvec_t convert_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.set_elt(d, int_t((*this)[d]));
-      return res;
-    }
-    
-    
-    
-    realvec_t operator+() const { return mkvec(+v); }
-    realvec_t operator-() const { return mkvec(-v); }
-    
-    realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); }
-    realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
-    realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
-    realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
-    
-    realvec_t& operator+=(const realvec_t& x) { return *this=*this+x; }
-    realvec_t& operator-=(const realvec_t& x) { return *this=*this-x; }
-    realvec_t& operator*=(const realvec_t& x) { return *this=*this*x; }
-    realvec_t& operator/=(const realvec_t& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) {
-        res = builtin_fmax(res, (*this)[d]);
-      }
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) {
-        res = builtin_fmin(res, (*this)[d]);
-      }
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = (*this)[0];
-      for (int d=1; d<size; ++d) res *= (*this)[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = (*this)[0];
-      for (int d=1; d<size; ++d) res += (*this)[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v == x.v);
-    }
-    boolvec_t operator!=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v != x.v);
-    }
-    boolvec_t operator<(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v < x.v);
-    }
-    boolvec_t operator<=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v <= x.v);
-    }
-    boolvec_t operator>(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v > x.v);
-    }
-    boolvec_t operator>=(const realvec_t& x) const
-    {
-      return boolvec_t::mkvec(v >= x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return map(builtin_acos); }
-    realvec_t acosh() const { return map(builtin_acosh); }
-    realvec_t asin() const { return map(builtin_asin); }
-    realvec_t asinh() const { return map(builtin_asinh); }
-    realvec_t atan() const { return map(builtin_atan); }
-    realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); }
-    realvec_t atanh() const { return map(builtin_atanh); }
-    realvec_t cbrt() const { return map(builtin_cbrt); }
-    realvec_t ceil() const { return map(builtin_ceil); }
-    realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
-    realvec_t cos() const { return map(builtin_cos); }
-    realvec_t cosh() const { return map(builtin_cosh); }
-    realvec_t exp() const { return map(builtin_exp); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return map(builtin_exp2); }
-    realvec_t expm1() const { return map(builtin_expm1); }
-    realvec_t fabs() const { return map(builtin_fabs); }
-    realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
-    realvec_t floor() const { return map(builtin_floor); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return map(builtin_fma, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
-    realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
-    realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
-    realvec_t frexp(intvec_t* r) const
-    {
-      realvec_t res;
-      intvec_t exp;
-      for (int d=0; d<size; ++d) {
-        real_t val = (*this)[d];
-        int iexp;
-        res.set_elt(d, __builtin_frexp(val, &iexp));
-        int_t jexp = int_t(iexp);
-        if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
-        if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
-        exp.set_elt(d, jexp);
-      }
-      *r = exp;
-      return res;
-    }
-    realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); }
-    intvec_t ilogb() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        real_t val = (*this)[d];
-        int iexp = __builtin_ilogb(val);
-        int_t jexp = int_t(iexp);
-        if (val == R(0.0)) jexp = std::numeric_limits<int_t>::min();
-        if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
-        if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
-        res.set_elt(d, jexp);
-      }
-      return res;
-    }
-    boolvec_t isfinite() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isinf() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isinf((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isnan() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isnan((*this)[d]) != 0);
-      }
-      return res;
-    }
-    boolvec_t isnormal() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
-      }
-      return res;
-    }
-    realvec_t ldexp(int_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
-      }
-      return res;
-    }
-    realvec_t ldexp(intvec_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
-      }
-      return res;
+    *(vector_t *)p = v;
+  }
+  void storeu(real_t *p) const {
+    // Unaligned store, one lane at a time. Disabled alternatives:
+    //   *(vector_t*)p = v;   or   memcpy(p, &v, sizeof v);
+    for (int lane = 0; lane < size; ++lane)
+      p[lane] = v[lane];
+  }
+  void storeu(real_t *p, size_t ioff) const { // stores to p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // checks base p, not p + ioff
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, const mask_t &m) const { // masked store, checked p
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m); // forwards to the masked storeu overload
+  }
+  void storeu(real_t *p, const mask_t &m) const { // writes only lanes set in m
+    for (int lane = 0; lane < size; ++lane)
+      if (m.m[lane])
+        p[lane] = v[lane];
+  }
+  void storeu(real_t *p, size_t ioff, const mask_t &m) const { // to p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // checks base p, not p + ioff
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { // byte-wise copy of v, no value conversion
+    intvec_t bits;
+    std::memcpy(&bits.v, &v, sizeof bits.v);
+    return bits;
+  }
+  intvec_t convert_int() const { // per-lane value conversion int_t(x)
+    intvec_t converted;
+    for (int lane = 0; lane < size; ++lane)
+      converted.set_elt(lane, int_t(v[lane]));
+    return converted;
+  }
+
+  realvec_t operator+() const { return mkvec(+v); }
+  realvec_t operator-() const { return mkvec(-v); }
+
+  realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); }
+  realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
+  realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
+  realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
+
+  realvec_t &operator+=(const realvec_t &x) { return *this = *this + x; }
+  realvec_t &operator-=(const realvec_t &x) { return *this = *this - x; }
+  realvec_t &operator*=(const realvec_t &x) { return *this = *this * x; }
+  realvec_t &operator/=(const realvec_t &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d) {
+      res = builtin_fmax(res, (*this)[d]);
     }
-    realvec_t log() const { return map(builtin_log); }
-    realvec_t log10() const { return map(builtin_log10); }
-    realvec_t log1p() const { return map(builtin_log1p); }
-    realvec_t log2() const { return map(builtin_log2); }
-    intvec_t lrint() const
-    {
-      if (sizeof(int_t) <= sizeof(long)) {
-        return map(builtin_lrint);
-      } else if (sizeof(int_t) <= sizeof(long long)) {
-        return map(builtin_llrint);
-      }
-      __builtin_unreachable();
+    return res;
+  }
+  real_t minval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d) {
+      res = builtin_fmin(res, (*this)[d]);
     }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
+    return res;
+  }
+  real_t prod() const { // product of all lanes, accumulated from lane 0 up
+    real_t acc = v[0];
+    for (int lane = 1; lane < size; ++lane)
+      acc *= v[lane];
+    return acc;
+  }
+  real_t sum() const { // sum of all lanes, accumulated from lane 0 up
+    real_t acc = v[0];
+    for (int lane = 1; lane < size; ++lane)
+      acc += v[lane];
+    return acc;
+  }
+
+  boolvec_t operator==(const realvec_t &x) const {
+    return boolvec_t::mkvec(v == x.v);
+  }
+  boolvec_t operator!=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v != x.v);
+  }
+  boolvec_t operator<(const realvec_t &x) const {
+    return boolvec_t::mkvec(v < x.v);
+  }
+  boolvec_t operator<=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v <= x.v);
+  }
+  boolvec_t operator>(const realvec_t &x) const {
+    return boolvec_t::mkvec(v > x.v);
+  }
+  boolvec_t operator>=(const realvec_t &x) const {
+    return boolvec_t::mkvec(v >= x.v);
+  }
+
+  realvec_t acos() const { return map(builtin_acos); }
+  realvec_t acosh() const { return map(builtin_acosh); }
+  realvec_t asin() const { return map(builtin_asin); }
+  realvec_t asinh() const { return map(builtin_asinh); }
+  realvec_t atan() const { return map(builtin_atan); }
+  realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); }
+  realvec_t atanh() const { return map(builtin_atanh); }
+  realvec_t cbrt() const { return map(builtin_cbrt); }
+  realvec_t ceil() const { return map(builtin_ceil); }
+  realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
+  realvec_t cos() const { return map(builtin_cos); }
+  realvec_t cosh() const { return map(builtin_cosh); }
+  realvec_t exp() const { return map(builtin_exp); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return map(builtin_exp2); }
+  realvec_t expm1() const { return map(builtin_expm1); }
+  realvec_t fabs() const { return map(builtin_fabs); }
+  realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
+  realvec_t floor() const { return map(builtin_floor); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return map(builtin_fma, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
+  realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
+  realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
+  realvec_t frexp(intvec_t *r) const { // returns fractions, exponents to *r
+    realvec_t fractions;
+    intvec_t exponents;
+    for (int lane = 0; lane < size; ++lane) {
+      const real_t val = v[lane];
+      int raw_exp;
+      fractions.set_elt(lane, __builtin_frexp(val, &raw_exp));
+      int_t lane_exp = int_t(raw_exp);
+      if (__builtin_isnan(val)) // NaN reports the smallest int_t
+        lane_exp = std::numeric_limits<int_t>::min();
+      else if (__builtin_isinf(val)) // infinity reports the largest int_t
+        lane_exp = std::numeric_limits<int_t>::max();
+      exponents.set_elt(lane, lane_exp);
+    }
+    *r = exponents;
+    return fractions;
+  }
+  realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); }
+  intvec_t ilogb() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      real_t val = (*this)[d];
+      int iexp = __builtin_ilogb(val);
+      int_t jexp = int_t(iexp);
+      if (val == R(0.0))
+        jexp = std::numeric_limits<int_t>::min();
+      if (__builtin_isinf(val))
+        jexp = std::numeric_limits<int_t>::max();
+      if (__builtin_isnan(val))
+        jexp = std::numeric_limits<int_t>::min();
+      res.set_elt(d, jexp);
     }
-    realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
-    realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
-    realvec_t rcp() const { return RV(1.0) / *this; }
-    realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
-    realvec_t rint() const { return map(builtin_rint); }
-    realvec_t round() const { return map(builtin_round); }
-    realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
-    boolvec_t signbit() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) {
-        res.set_elt(d, builtin_signbit((*this)[d]) != 0);
-      }
-      return res;
+    return res;
+  }
+  boolvec_t isfinite() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
     }
-    realvec_t sin() const { return map(builtin_sin); }
-    realvec_t sinh() const { return map(builtin_sinh); }
-    realvec_t sqrt() const { return map(builtin_sqrt); }
-    realvec_t tan() const { return map(builtin_tan); }
-    realvec_t tanh() const { return map(builtin_tanh); }
-    realvec_t trunc() const { return map(builtin_trunc); }
-  };
-  
-  
-  
-  // boolbuiltinvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t boolbuiltinvec<T,N>::as_int() const
-  {
-    intvec_t res;
-    std::memcpy(&res.v, &v, sizeof res.v);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t
-  boolbuiltinvec<T,N>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::boolvec_t
-  boolbuiltinvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    // return v ? x.v : y.v;
+  boolvec_t isinf() const {
     boolvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isinf((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::intvec_t
-  boolbuiltinvec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
-    // return v ? x.v : y.v;
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  boolvec_t isnan() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isnan((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolbuiltinvec<T,N>::realvec_t
-  boolbuiltinvec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
-    // return v ? x.v : y.v;
-    realvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  boolvec_t isnormal() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
+    }
     return res;
   }
-  
-  
-  
-  // intbuiltinvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename intbuiltinvec<T,N>::realvec_t intbuiltinvec<T,N>::as_float() const
-  {
+  realvec_t ldexp(int_t n) const {
     realvec_t res;
-    std::memcpy(&res.v, &v, sizeof res.v);
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename intbuiltinvec<T,N>::realvec_t
-  intbuiltinvec<T,N>::convert_float() const
-  {
+  realvec_t ldexp(intvec_t n) const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.set_elt(d, real_t((*this)[d]));
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
+    }
     return res;
   }
-  
-  
-  
-  // Wrappers
-  
-  // boolbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolbuiltinvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolbuiltinvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                      boolbuiltinvec<real_t, size> x,
-                                      boolbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                     intbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
-                                      realbuiltinvec<real_t, size> x,
-                                      realbuiltinvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
-                                        intbuiltinvec<real_t, size> y,
-                                        intbuiltinvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> convert_bool(intbuiltinvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> convert_float(intbuiltinvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
-                                  typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
-                                  intbuiltinvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size>
-  rotate(intbuiltinvec<real_t, size> x,
-         typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realbuiltinvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  loada(real_t const* p,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size>
-  loadu(real_t const* p,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realbuiltinvec<real_t, size> x,
-        typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realbuiltinvec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realbuiltinvec<real_t, size> x, real_t* p,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realbuiltinvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realbuiltinvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realbuiltinvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realbuiltinvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realbuiltinvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
-                                            realbuiltinvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-    
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-    
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
-                                               realbuiltinvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
-                                        realbuiltinvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y,
-                                          realbuiltinvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
-                                           realbuiltinvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
-                                            intbuiltinvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
-                                            realbuiltinvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size>
-  ldexp(realbuiltinvec<real_t, size> x,
-        typename intbuiltinvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
-                                     intbuiltinvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y,
-                                          realbuiltinvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
-                                                realbuiltinvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
-                                          realbuiltinvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
-                                                realbuiltinvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
-#ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           boolbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           intbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realbuiltinvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
+  realvec_t log() const { return map(builtin_log); }
+  realvec_t log10() const { return map(builtin_log10); }
+  realvec_t log1p() const { return map(builtin_log1p); }
+  realvec_t log2() const { return map(builtin_log2); }
+  intvec_t lrint() const {
+    if (sizeof(int_t) <= sizeof(long)) {
+      return map(builtin_lrint);
+    } else if (sizeof(int_t) <= sizeof(long long)) {
+      return map(builtin_llrint);
+    }
+    __builtin_unreachable();
+  }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
+  realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
+  realvec_t rcp() const { return RV(1.0) / *this; }
+  realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
+  realvec_t rint() const { return map(builtin_rint); }
+  realvec_t round() const { return map(builtin_round); }
+  realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
+  boolvec_t signbit() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d) {
+      res.set_elt(d, builtin_signbit((*this)[d]) != 0);
     }
-    os << "]";
-    return os;
+    return res;
   }
+  realvec_t sin() const { return map(builtin_sin); }
+  realvec_t sinh() const { return map(builtin_sinh); }
+  realvec_t sqrt() const { return map(builtin_sqrt); }
+  realvec_t tan() const { return map(builtin_tan); }
+  realvec_t tanh() const { return map(builtin_tanh); }
+  realvec_t trunc() const { return map(builtin_trunc); }
+};
+
+// boolbuiltinvec definitions
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::as_int() const {
+  intvec_t res;
+  std::memcpy(&res.v, &v, sizeof res.v);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::convert_int() const {
+  return -as_int();
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::boolvec_t
+boolbuiltinvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  // return v ? x.v : y.v;
+  boolvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  // return v ? x.v : y.v;
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::realvec_t
+boolbuiltinvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  // return v ? x.v : y.v;
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+  return res;
+}
+
+// intbuiltinvec definitions
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::as_float() const {
+  realvec_t res;
+  std::memcpy(&res.v, &v, sizeof res.v);
+  return res;
+}
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::convert_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.set_elt(d, real_t((*this)[d]));
+  return res;
+}
+
+// Wrappers
+
+// boolbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(boolbuiltinvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(boolbuiltinvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                           boolbuiltinvec<real_t, size> x,
+                                           boolbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                          intbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+                                           realbuiltinvec<real_t, size> x,
+                                           realbuiltinvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
+                                             intbuiltinvec<real_t, size> y,
+                                             intbuiltinvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size>
+convert_bool(intbuiltinvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+convert_float(intbuiltinvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+lsr(intbuiltinvec<real_t, size> x,
+    typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
+                                       intbuiltinvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+rotate(intbuiltinvec<real_t, size> x,
+       typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loada(real_t const *p, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realbuiltinvec<real_t, size> x,
+      typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realbuiltinvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realbuiltinvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realbuiltinvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realbuiltinvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realbuiltinvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
+                                          realbuiltinvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
+                                             realbuiltinvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y,
+                                        realbuiltinvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
+                                         realbuiltinvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
+                                          realbuiltinvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+ldexp(realbuiltinvec<real_t, size> x,
+      typename intbuiltinvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
+                                          intbuiltinvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y,
+                                        realbuiltinvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
+                                              realbuiltinvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
+                                        realbuiltinvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
+                                              realbuiltinvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x) {
+  return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         boolbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         intbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         realbuiltinvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_BUILTIN_H
+#endif // #ifndef VEC_BUILTIN_H
diff --git a/vec_mask.h b/vec_mask.h
index 6f8c996..053e43a 100644
--- a/vec_mask.h
+++ b/vec_mask.h
@@ -5,74 +5,67 @@
 
 #include <cstdlib>
 
+namespace vecmathlib {
 
+template <typename realvec_t> class mask_t {
 
-namespace vecmathlib {
-  
-  template<typename realvec_t>
-  class mask_t {
-    
-    typedef typename realvec_t::boolvec_t boolvec_t;
-    typedef typename realvec_t::intvec_t intvec_t;
-    static const int size = realvec_t::size;
-    
-  public:
-    std::ptrdiff_t imin, imax;
-    std::ptrdiff_t i;
-    boolvec_t m;
-    bool all_m;
-    
-  public:
-    
-    // Construct a mask from a boolvec
-    mask_t(boolvec_t m_): m(m_), all_m(all(m)) {}
-    
-    // Construct a mask for a particular location i
-    mask_t(std::ptrdiff_t i_,
-           std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
-      imin(imin_), imax(imax_), i(i_)
-    {
-      all_m = i-imin >= 0 && i+size-1-imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = (! isignbit(intvec_t(i          - imin) + intvec_t::iota()) &&
-               isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
-      }
+  typedef typename realvec_t::boolvec_t boolvec_t;
+  typedef typename realvec_t::intvec_t intvec_t;
+  static const int size = realvec_t::size;
+
+public:
+  std::ptrdiff_t imin, imax;
+  std::ptrdiff_t i;
+  boolvec_t m;
+  bool all_m;
+
+public:
+  // Construct a mask from a boolvec
+  mask_t(boolvec_t m_) : m(m_), all_m(all(m)) {}
+
+  // Construct a mask for a particular location i
+  mask_t(std::ptrdiff_t i_, std::ptrdiff_t imin_, std::ptrdiff_t imax_,
+         std::ptrdiff_t ioff)
+      : imin(imin_), imax(imax_), i(i_) {
+    all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+           isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
     }
-    
-    // Construct a mask for a loop starting at imin, aligned down
-    mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
-      imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size)
-    {
-      all_m = i-imin >= 0 && i+size-1-imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = (! isignbit(intvec_t(i          - imin) + intvec_t::iota()) &&
-               isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
-      }
+  }
+
+  // Construct a mask for a loop starting at imin, aligned down
+  mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff)
+      : imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size) {
+    all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+           isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
     }
-    
-    // Get current index
-    std::ptrdiff_t index() const { return i; }
-    
-    // Looping condition
-    operator bool() const { return i<imax; }
-    
-    // Loop stepper
-    void operator++()
-    {
-      i += size;
-      all_m = i + size-1 - imax < 0;
-      if (__builtin_expect(all_m, true)) {
-        m = true;
-      } else {
-        m = isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota());
-      }
+  }
+
+  // Get current index
+  std::ptrdiff_t index() const { return i; }
+
+  // Looping condition
+  operator bool() const { return i < imax; }
+
+  // Loop stepper
+  void operator++() {
+    i += size;
+    all_m = i + size - 1 - imax < 0;
+    if (__builtin_expect(all_m, true)) {
+      m = true;
+    } else {
+      m = isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota());
     }
-  };
-  
+  }
+};
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_MASK_H
+#endif // #ifndef VEC_MASK_H
diff --git a/vec_mic_double8.h b/vec_mic_double8.h
index 68dd5aa..ef22088 100644
--- a/vec_mic_double8.h
+++ b/vec_mic_double8.h
@@ -12,697 +12,585 @@
 // MIC intrinsics
 #include <immintrin.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_8
-  template<> struct boolvec<double,8>;
-  template<> struct intvec<double,8>;
-  template<> struct realvec<double,8>;
-  
-  
-  
-  template<>
-  struct boolvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef bool scalar_t;
-    typedef __mask8 bvector_t;
-    static const int alignment = sizeof(bvector_t);
-    
-    // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-    //               "vector size is wrong");
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(const boolvec& x): v(x.v) {}
-    // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(- bvector_t(a)) {}
-    boolvec(const bool* as):
-      v((bvector_t(as[0]) << 0) |
-        (bvector_t(as[1]) << 1) |
-        (bvector_t(as[2]) << 2) |
-        (bvector_t(as[3]) << 3) |
-        (bvector_t(as[4]) << 4) |
-        (bvector_t(as[5]) << 5) |
-        (bvector_t(as[6]) << 6) |
-        (bvector_t(as[7]) << 7))
-    {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return (v >> n) & 1;
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      v &= ~ (bvector_t(1) << n);
-      v |= bvector_t(a) << n;
-      return *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return _mm512_knot(v); }
-    
-    boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
-    boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
-    boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
-    boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
-    
-    bool all() const { return _mm512_kortestc(v, v); }
-    bool any() const { return ! bool(_mm512_kortestz(v, v)); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef int_t scalar_t;
-    typedef __m512i ivector_t;
-    static const int alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(const intvec& x): v(x.v) {}
-    // intvec& operator=(const intvec& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm512_set1_epi64(a)) {}
-    intvec(const int_t* as)
-    {
-      v = _mm512_undefined_epi32();
-      // v = _mm512_loadunpacklo_epi32(v, as);
-      // v = _mm512_loadunpackhi_epi32(v, as+8);
-      for (int n=0; n<size; ++n) set_elt(n, as[n]);
-    }
-    static intvec iota()
-    {
-      intvec r;
-      for (int n=0; n<size; ++n) r.set_elt(n, n);
-      return r;
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-  private:
-    static __mmask8 mask16tomask8(__mmask16 m16)
-    {
-      // combine 01
-      m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
-      // combine 0123
-      m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
-      // combine 01234567
-      m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
-      return m16;
-    }
-  public:
-    boolvec_t as_bool() const { return convert_bool(); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      __mmask16 r16 = _mm512_test_epi32_mask(v, v);
-      return mask16tomask8(r16);
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return IV(I(0)) - *this; }
-    intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
-    intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
-    
-    intvec& operator+=(const intvec& x) { return *this=*this+x; }
-    intvec& operator-=(const intvec& x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const { return IV(~U(0)) ^ *this; }
-    intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
-    intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
-    intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
-    
-    intvec& operator&=(const intvec& x) { return *this=*this&x; }
-    intvec& operator|=(const intvec& x) { return *this=*this|x; }
-    intvec& operator^=(const intvec& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec lsr(int_t n) const
-    {
-      if (n < 32) {
-        __m512i vlo = _mm512_srli_epi32(v, n);
-        __m512i vhi = _mm512_slli_epi32(v, 32-n);
-        vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo);
-      } else {
-        __m512i vlo = _mm512_srli_epi32(v, n-32);
-        __m512i vhi = _mm512_setzero_epi32();
-        return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
-      }
-    }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const
-    {
-      if (n < 32) {
-        __mm512i vlo = _mm512_srai_epi32(v, n);
-        __mm512i vlo0 = _mm512_srli_epi32(v, n);
-        __mm512i vhi = _mm512_slli_epi32(v, 32-n);
-        vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0);
-      } else {
-        __m512i vlo = _mm512_srai_epi32(v, n-32);
-        __m512i vhi = _mm512_srai_epi32(v, 31);
-        return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
-      }
-    }
-    intvec operator<<(int_t n) const
-    {
-      if (n < 32) {
-        __m512i vlo = _mm512_srli_epi32(v, n);
-        __m512i vhi = _mm512_slli_epi32(v, 32-n);
-        vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB);
-        return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo);
-      } else {
-        __m512i vlo = _mm512_setzero_epi32();
-        __m512i vhi = _mm512_slli_epi32(v, n-32);
-        return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo);
-      }
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      // TODO: improve this
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const
-    {
-      // Return 8*sizeof(TYPE) when the input is 0
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        // __lzcnt64
-        r.set_elt(i, __builtin_clzll((*this)[i]));
-      }
-      return r;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        // _mm_popcnt_u64
-        r.set_elt(i, __builtin_popcountll((*this)[i]));
-      }
-      return r;
-    }
-    
-    
-    
-    boolvec_t operator==(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
-    }
-    boolvec_t operator!=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
-    }
-    boolvec_t operator<(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
-    }
-    boolvec_t operator<=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
-    }
-    boolvec_t operator>(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
-    }
-    boolvec_t operator>=(const intvec& x) const
-    {
-      return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const;
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,8>: floatprops<double>
-  {
-    static const int size = 8;
-    typedef real_t scalar_t;
-    typedef __m512d vector_t;
-    static const int alignment = sizeof(vector_t);
-    
-    static const char* name() { return "<MIC:8*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(const realvec& x): v(x.v) {}
-    // realvec& operator=(const realvec& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm512_set1_pd(a)) {}
-    realvec(const real_t* as)
-    {
-      v = _mm512_undefined_pd();
-      // v = _mm512_loadunpacklo_pd(v, as);
-      // v = _mm512_loadunpackhi_pd(v, as+8);
-      for (int n=0; n<size; ++n) set_elt(n, as[n]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm512_load_pd(p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      realvec_t r(_mm512_undefined_pd());
-      r.v = _mm512_loadunpacklo_pd(r.v, p);
-      r.v = _mm512_loadunpackhi_pd(r.v, p+8);
-      return r.v;
-    }
-    static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm512_mask_load_pd(v, m.m.v, p);
+template <> struct boolvec<double, 8>;
+template <> struct intvec<double, 8>;
+template <> struct realvec<double, 8>;
+
+template <> struct boolvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef bool scalar_t;
+  typedef __mmask8 bvector_t; // 8-bit mask; bit n holds element n
+  static const int alignment = sizeof(bvector_t);
+
+  // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+  //               "vector size is wrong");
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(const boolvec& x): v(x.v) {}
+  // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(-bvector_t(a)) {}
+  boolvec(const bool *as)
+      : v((bvector_t(as[0]) << 0) | (bvector_t(as[1]) << 1) |
+          (bvector_t(as[2]) << 2) | (bvector_t(as[3]) << 3) |
+          (bvector_t(as[4]) << 4) | (bvector_t(as[5]) << 5) |
+          (bvector_t(as[6]) << 6) | (bvector_t(as[7]) << 7)) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return (v >> n) & 1; }
+  boolvec &set_elt(int n, bool a) {
+    v &= ~(bvector_t(1) << n);
+    v |= bvector_t(a) << n;
+    return *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return _mm512_knot(v); }
+
+  boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
+  boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
+  boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
+  boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
+  // kortestc tests all 16 mask bits; OR in the 8 unused upper bits for all()
+  bool all() const { return _mm512_kortestc(v, 0xff00); }
+  bool any() const { return !bool(_mm512_kortestz(v, v)); }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef int_t scalar_t;
+  typedef __m512i ivector_t;
+  static const int alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(const intvec& x): v(x.v) {}
+  // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm512_set1_epi64(a)) {}
+  intvec(const int_t *as) {
+    v = _mm512_undefined_epi32();
+    // v = _mm512_loadunpacklo_epi32(v, as);
+    // v = _mm512_loadunpackhi_epi32(v, as+8);
+    for (int n = 0; n < size; ++n)
+      set_elt(n, as[n]);
+  }
+  static intvec iota() {
+    intvec r;
+    for (int n = 0; n < size; ++n)
+      r.set_elt(n, n);
+    return r;
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+private:
+  static __mmask8 mask16tomask8(__mmask16 m16) {
+    // combine 01
+    m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
+    // combine 0123
+    m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
+    // combine 01234567
+    m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
+    return m16;
+  }
+
+public:
+  boolvec_t as_bool() const { return convert_bool(); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    __mmask16 r16 = _mm512_test_epi32_mask(v, v);
+    return mask16tomask8(r16);
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return IV(I(0)) - *this; }
+  intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
+  intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
+
+  intvec &operator+=(const intvec &x) { return *this = *this + x; }
+  intvec &operator-=(const intvec &x) { return *this = *this - x; }
+
+  intvec operator~() const { return IV(~U(0)) ^ *this; }
+  intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
+  intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
+  intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
+
+  intvec &operator&=(const intvec &x) { return *this = *this & x; }
+  intvec &operator|=(const intvec &x) { return *this = *this | x; }
+  intvec &operator^=(const intvec &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec lsr(int_t n) const { // logical right shift of every 64-bit lane
+    if (n < 32) {
+      __m512i vlo = _mm512_srli_epi32(v, n);
+      __m512i vhi = _mm512_slli_epi32(v, 32 - n); // bits crossing into low half
+      vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
+      return _mm512_mask_or_epi32(vlo, 0b0101010101010101, vhi, vlo);
+    } else {
+      __m512i vlo = _mm512_srli_epi32(v, n - 32);
+      __m512i vhi = _mm512_setzero_epi32(); // high halves become zero
+      return _mm512_mask_swizzle_epi32(vhi, 0b0101010101010101, vlo, _MM_SWIZ_REG_CDAB);
     }
-    realvec_t loadu(const real_t* p, const mask_t& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+  }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { // arithmetic right shift of 64-bit lanes
+    if (n < 32) {
+      __m512i vlo = _mm512_srai_epi32(v, n);  // sign-filled, kept for high half
+      __m512i vlo0 = _mm512_srli_epi32(v, n); // zero-filled, used for low half
+      __m512i vhi = _mm512_slli_epi32(v, 32 - n);
+      vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
+      return _mm512_mask_or_epi32(vlo, 0b0101010101010101, vhi, vlo0);
+    } else {
+      __m512i vlo = _mm512_srai_epi32(v, n - 32);
+      __m512i vhi = _mm512_srai_epi32(v, 31); // every bit equals the sign bit
+      return _mm512_mask_swizzle_epi32(vhi, 0b0101010101010101, vlo, _MM_SWIZ_REG_CDAB);
     }
-    realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+  }
+  intvec operator<<(int_t n) const { // left shift of every 64-bit lane
+    if (n < 32) {
+      __m512i vlo = _mm512_srli_epi32(v, 32 - n); // bits crossing into high half
+      __m512i vhi = _mm512_slli_epi32(v, n);
+      vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB);
+      return _mm512_mask_or_epi32(vhi, 0b1010101010101010, vhi, vlo);
+    } else {
+      __m512i vlo = _mm512_setzero_epi32(); // low halves become zero
+      __m512i vhi = _mm512_slli_epi32(v, n - 32);
+      return _mm512_mask_swizzle_epi32(vlo, 0b1010101010101010, vhi, _MM_SWIZ_REG_CDAB);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm512_store_pd(p, v);
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    void storeu(real_t* p) const
-    {
-      _mm512_packstorelo_pd(p, v);
-      _mm512_packstorehi_pd(p+8, v);
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    // TODO: improve this
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    void storea(real_t* p, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm512_mask_store_pd(p, m.m.v, v);
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const {
+    // Return 8*sizeof(TYPE) when the input is 0 (__builtin_clzll(0) is undefined)
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      const int_t e = (*this)[i]; // __lzcnt64
+      r.set_elt(i, e == 0 ? int_t(8 * sizeof(int_t)) : int_t(__builtin_clzll(e)));
     }
-    void storeu(real_t* p, const mask_t& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        for (int n=0; n<size; ++n) {
-          if (m.m[n]) p[n] = (*this)[n];
-        }
-      }
+    return r;
+  }
+  intvec_t popcount() const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      // _mm_popcnt_u64
+      r.set_elt(i, __builtin_popcountll((*this)[i]));
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
+    return r;
+  }
+
+  boolvec_t operator==(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
+  }
+  boolvec_t operator!=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
+  }
+  boolvec_t operator<(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
+  }
+  boolvec_t operator<=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
+  }
+  boolvec_t operator>(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
+  }
+  boolvec_t operator>=(const intvec &x) const {
+    return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
+  }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const;
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 8> : floatprops<double> {
+  static const int size = 8;
+  typedef real_t scalar_t;
+  typedef __m512d vector_t;
+  static const int alignment = sizeof(vector_t);
+
+  static const char *name() { return "<MIC:8*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(const realvec& x): v(x.v) {}
+  // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm512_set1_pd(a)) {}
+  realvec(const real_t *as) {
+    v = _mm512_undefined_pd();
+    // v = _mm512_loadunpacklo_pd(v, as);
+    // v = _mm512_loadunpackhi_pd(v, as+8);
+    for (int n = 0; n < size; ++n)
+      set_elt(n, as[n]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm512_load_pd(p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    // Unaligned load assembled from the two unpack halves (p and p + 8)
+    const vector_t lo = _mm512_loadunpacklo_pd(_mm512_undefined_pd(), p);
+    const vector_t both = _mm512_loadunpackhi_pd(lo, p + 8);
+    return both;
+  }
+  static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    // p is aligned, so p + ioff stays aligned when ioff is a multiple of size
+    const bool whole_vectors = ioff % realvec::size == 0;
+    return whole_vectors ? loada(p + ioff) : loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm512_mask_load_pd(v, m.m.v, p);
+  }
+  realvec_t loadu(const real_t *p, const mask_t &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm512_castpd_si512(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r(_mm512_undefined_epi32());
-      for (int n=0; n<size; ++n) {
-        r.set_elt(n, floatprops::convert_int((*this)[n]));
+  }
+  realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm512_store_pd(p, v);
+  }
+  void storeu(real_t *p) const {
+    _mm512_packstorelo_pd(p, v);
+    _mm512_packstorehi_pd(p + 8, v);
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    real_t *const dst = p + ioff;
+    // An offset of whole vectors keeps dst aligned; otherwise store unaligned
+    (ioff % realvec::size == 0) ? storea(dst) : storeu(dst);
+  }
+  void storea(real_t *p, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm512_mask_store_pd(p, m.m.v, v);
+  }
+  void storeu(real_t *p, const mask_t &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      for (int n = 0; n < size; ++n) {
+        if (m.m[n])
+          p[n] = (*this)[n];
       }
-      return r;
-    }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
-    realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
-    realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
-    realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
-    
-    realvec& operator+=(const realvec& x) { return *this=*this+x; }
-    realvec& operator-=(const realvec& x) { return *this=*this-x; }
-    realvec& operator*=(const realvec& x) { return *this=*this*x; }
-    realvec& operator/=(const realvec& x) { return *this=*this/x; }
-    
-    real_t maxval() const { returm _mm512_reduce_gmax_pd(v); }
-    real_t minval() const { returm _mm512_reduce_gmin_pd(v); }
-    real_t prod() const { returm _mm512_reduce_mul_pd(v); }
-    real_t sum() const { returm _mm512_reduce_add_pd(v); }
-    
-    
-    
-    boolvec_t operator==(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
-    }
-    boolvec_t operator!=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
-    }
-    boolvec_t operator<(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
-    }
-    boolvec_t operator<=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
-    }
-    boolvec_t operator>(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
     }
-    boolvec_t operator>=(const realvec& x) const
-    {
-      return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
-    }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return _mm512_ceil_pd(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return MF::vml_fabs(*this); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return _mm512_floor_pd(v); }
-    realvec fma(realvec y, realvec z) const
-    {
-      return _mm512_fmadd_pd(v, x.v, y.v);
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm512_castpd_si512(v); }
+  intvec_t convert_int() const {
+    intvec_t r(_mm512_undefined_epi32());
+    for (int n = 0; n < size; ++n) {
+      r.set_elt(n, floatprops::convert_int((*this)[n]));
     }
-    realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
-    realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+    return r;
+  }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return RV(0.0) - *this; }
+
+  realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
+  realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
+  realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
+  realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
+
+  realvec &operator+=(const realvec &x) { return *this = *this + x; }
+  realvec &operator-=(const realvec &x) { return *this = *this - x; }
+  realvec &operator*=(const realvec &x) { return *this = *this * x; }
+  realvec &operator/=(const realvec &x) { return *this = *this / x; }
+
+  real_t maxval() const { return _mm512_reduce_gmax_pd(v); } // max over lanes
+  real_t minval() const { return _mm512_reduce_gmin_pd(v); } // min over lanes
+  real_t prod() const { return _mm512_reduce_mul_pd(v); } // product of lanes
+  real_t sum() const { return _mm512_reduce_add_pd(v); } // sum of lanes
+
+  boolvec_t operator==(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
+  }
+  boolvec_t operator!=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+  }
+  boolvec_t operator<(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
+  }
+  boolvec_t operator<=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
+  }
+  boolvec_t operator>(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
+  }
+  boolvec_t operator>=(const realvec &x) const {
+    return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
+  }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return _mm512_ceil_pd(v); }
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return MF::vml_fabs(*this); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return _mm512_floor_pd(v); }
+  realvec fma(realvec y, realvec z) const {
+    return _mm512_fmadd_pd(v, y.v, z.v); // fused (*this) * y + z
+  }
+  realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
+  realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
+    return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return _mm512_fmadd_pd(v, x.v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
-    }
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return as_int().signbit(); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return _mm512_sqrt_pd(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return _mm512_castpd_si512(v);
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    return ifthen(v, IV(I(1)), IV(I(0)));
-  }
-  
-  inline
-  boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return (v & x.v) | (~v & y.v);
-  }
-  
-  inline
-  intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return _mm512_blend_epi64(v, y.v, x.v)
-  }
-  
-  inline
-  realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return _mm512_blend_pd(v, y.v, x.v)
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return _mm512_castsi512_pd(v);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    intvec_t r(_mm512_undefined_pd());
-    for (int n=0; n<size; ++n) {
-      r.set_elt(n, floatprops::convert_float((*this)[n]));
-    }
-    return r;
   }
-  
-  inline intvec<double,8> intvec<double,8>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline boolvec<double,8> intvec<double,8>::isignbit() const
-  {
-    return MF::vml_isignbit(*this);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,8> intvec<double,8>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return _mm512_fmadd_pd(v, y.v, z.v); // (*this) * y + z, same as fma()
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const { return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT); }
+  realvec round() const { return MF::vml_round(*this); }
+  realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return as_int().signbit(); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return _mm512_sqrt_pd(v); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 8> boolvec<double, 8>::as_int() const {
+  return _mm512_castpd_si512(v); // NOTE(review): needs v to be __m512d - confirm
+}
+
+inline intvec<double, 8> boolvec<double, 8>::convert_int() const {
+  return ifthen(IV(I(1)), IV(I(0))); // 1 in true lanes, 0 in false lanes
+}
+
+inline boolvec<double, 8> boolvec<double, 8>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return (v & x.v) | (~v & y.v); // bitwise select: x where v is set, else y
+}
+
+inline intvec<double, 8> boolvec<double, 8>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return _mm512_blend_epi64(v, y.v, x.v); // NOTE(review): Intel name has _mask_
+}
+
+inline realvec<double, 8> boolvec<double, 8>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return _mm512_blend_pd(v, y.v, x.v); // NOTE(review): Intel name has _mask_
+}
+
+// intvec definitions
+
+inline realvec<double, 8> intvec<double, 8>::as_float() const {
+  return _mm512_castsi512_pd(v); // reinterpret the register, no conversion
+}
+
+inline realvec<double, 8> intvec<double, 8>::convert_float() const {
+  realvec_t r(_mm512_undefined_pd()); // every lane is overwritten below
+  for (int n = 0; n < size; ++n) {
+    r.set_elt(n, floatprops::convert_float((*this)[n])); // scalar conversion
+  }
+  return r;
+}
+
+inline intvec<double, 8> intvec<double, 8>::abs() const {
+  return MF::vml_abs(*this); // no intrinsic here: forwarded to the MF helper
+}
+
+inline intvec<double, 8> intvec<double, 8>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y); // *this is passed as the selector
+}
+
+inline boolvec<double, 8> intvec<double, 8>::isignbit() const {
+  return MF::vml_isignbit(*this); // no intrinsic here: forwarded to MF helper
+}
+
+inline intvec<double, 8> intvec<double, 8>::max(intvec_t x) const {
+  return MF::vml_max(*this, x); // no intrinsic here: forwarded to MF helper
+}
+
+inline intvec<double, 8> intvec<double, 8>::min(intvec_t x) const {
+  return MF::vml_min(*this, x); // no intrinsic here: forwarded to MF helper
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n); // scalar count; forwarded to MF helper
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n); // per-lane counts; forwarded to MF helper
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_MIC_DOUBLE8_H
+#endif // #ifndef VEC_MIC_DOUBLE8_H
diff --git a/vec_neon_float2.h b/vec_neon_float2.h
index 3a21a05..6df9969 100644
--- a/vec_neon_float2.h
+++ b/vec_neon_float2.h
@@ -14,608 +14,511 @@
 // Neon intrinsics
 #include <arm_neon.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_2
-  template<> struct boolvec<float,2>;
-  template<> struct intvec<float,2>;
-  template<> struct realvec<float,2>;
-  
-  
-  
-  template<>
-  struct boolvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef uint32x2_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vdup_n_u32(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vmvn_u32(v); }
-    
-    boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
-    boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
-    boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
-    boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
-    
-    bool all() const
-    {
-      boolvec r = vpmin_u32(v, v);
-      return r[0];
-    }
-    bool any() const
-    {
-      boolvec r = vpmax_u32(v, v);
-      return r[0];
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef int32x2_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vdup_n_s32(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vneg_s32(v); }
-    
-    intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
-    intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
-    intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    
-    
-    
-    intvec operator~() const { return vmvn_s32(v); }
-    
-    intvec operator&(intvec x) const { return vand_s32(v, x.v); }
-    intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
-    intvec operator^(intvec x) const { return veor_s32(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vshl_s32(v, (-n).v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vshl_s32(v, n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return vclz_s32(v); }
-    intvec_t popcount() const
-    {
-      return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
-    }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); }
-    boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); }
-    
-    intvec_t abs() const { return vabs_s32(v); }
-    boolvec_t isignbit() const
-    {
-      //return *this < IV(I(0));
-      return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
-    }
-    intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
-    intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,2>: floatprops<float>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef float32x2_t vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<NEON:2*float>"; }
-    void barrier() { __asm__("": "+w"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vdup_n_f32(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vld1_f32(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
+template <> struct boolvec<float, 2>;
+template <> struct intvec<float, 2>;
+template <> struct realvec<float, 2>;
+
+template <> struct boolvec<float, 2> : floatprops<float> {
+  static const int size = 2;
+  typedef bool scalar_t;
+  typedef uint32x2_t bvector_t;
+  static const int alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // Lane encoding: all bits set means true, all bits clear means false
+  static uint_t from_bool(bool a) { return a ? uint_t(-1) : uint_t(0); }
+  static bool to_bool(uint_t a) { return a != 0; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Abbreviations used for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // A non-trivial copy constructor must not be declared: with one,
+  // objects would no longer be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vdup_n_u32(from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int lane = 0; lane < size; ++lane)
+      set_elt(lane, as[lane]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a));
+    return *this;
+  }
+
+  intvec_t as_int() const;      // body follows the intvec definition
+  intvec_t convert_int() const; // body follows the intvec definition
+
+  boolvec operator!() const { return vmvn_u32(v); }
+
+  boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
+  boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
+  boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
+  boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
+
+  bool all() const {
+    // Lane 0 of the pairwise minimum is set only if both lanes are set
+    return boolvec(vpmin_u32(v, v))[0];
+  }
+  bool any() const {
+    // Lane 0 of the pairwise maximum is set if either lane is set
+    return boolvec(vpmax_u32(v, v))[0];
+  }
+
+  // Lane-wise select: *this is the condition, x the then-value, y the else
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // body follows intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // body follows realvec
+};
+
+template <> struct intvec<float, 2> : floatprops<float> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef int32x2_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vdup_n_s32(a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
+  boolvec_t convert_bool() const { return *this != IV(0); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vneg_s32(v); }
+
+  intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
+  intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
+  intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+  intvec operator~() const { return vmvn_s32(v); }
+
+  intvec operator&(intvec x) const { return vand_s32(v, x.v); }
+  intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
+  intvec operator^(intvec x) const { return veor_s32(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
+  }
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const { // right shift done as unsigned vshl by -n
+    return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const { return vshl_s32(v, (-n).v); }
+  intvec operator<<(intvec n) const { return vshl_s32(v, n.v); }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const { return vclz_s32(v); }
+  intvec_t popcount() const {
+    return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
+  }
+
+  boolvec_t operator==(intvec const &x) const { return vceq_s32(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const { return vclt_s32(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return vcle_s32(v, x.v); }
+  boolvec_t operator>(intvec const &x) const { return vcgt_s32(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return vcge_s32(v, x.v); }
+
+  intvec_t abs() const { return vabs_s32(v); }
+  boolvec_t isignbit() const {
+    // return *this < IV(I(0));
+    return intvec(vshr_n_s32(v, FP::bits - 1)).as_bool();
+  }
+  intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
+  intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 2> : floatprops<float> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef float32x2_t vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<NEON:2*float>"; }
+  void barrier() { __asm__("" : "+w"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vdup_n_f32(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vld1_f32(p);
+  }
+  static realvec_t loadu(real_t const *p) {
 #if defined __ARM_FEATURE_UNALIGNED
-      return vld1_f32(p);
+    return vld1_f32(p);
 #else
-      realvec_t r;
-      r.set_elt(0, p[0]);
-      r.set_elt(1, p[1]);
-      return r;
+    realvec_t r;
+    r.set_elt(0, p[0]);
+    r.set_elt(1, p[1]);
+    return r;
 #endif
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vst1_f32(p, v);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vst1_f32(p, v);
+  }
+  void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
 #if defined __ARM_FEATURE_UNALIGNED
-      vst1_f32(p, v);
+    vst1_f32(p, v);
 #else
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
 #endif
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return vreinterpret_s32_f32(v); }
-    intvec_t convert_int() const { return vcvt_s32_f32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vneg_f32(v); }
-    
-    realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
-    realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
-    realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
-    realvec operator/(realvec x) const { return *this * x.rcp(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      realvec r = vpmax_f32(v, v);
-      return r[0];
-    }
-    real_t minval() const
-    {
-      realvec r = vpmin_f32(v, v);
-      return r[0];
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
-    }
-    real_t sum() const
-    {
-      realvec r = vpadd_f32(v, v);
-      return r[0];
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-      // return vrndp_f32(v);
-      return MF::vml_ceil(*this);
-    }
-    realvec copysign(realvec y) const
-    {
-      return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
-    }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vabs_f32(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-      // return vrndm_f32(v);
-      return MF::vml_floor(*this);
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return vfma_f32(z.v, v, y.v);
-    }
-    realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
-    realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      // TODO: vfma_f32
-      return vmla_f32(z.v, v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec r = vrecpe_f32(v);
-      r *= vrecps_f32(v, r);
-      r *= vrecps_f32(v, r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      // return vrndn_f32(v);
-      return MF::vml_rint(*this);
-    }
-    realvec round() const
-    {
-      // return vrnda_f32(v);
-      return MF::vml_round(*this);
-    }
-    realvec rsqrt() const
-    {
-      realvec r = vrsqrte_f32(v);
-      r *= vrsqrts_f32(v, r*r);
-      r *= vrsqrts_f32(v, r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return *this * rsqrt(); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-      // return vrnd_f32(v);
-      return MF::vml_trunc(*this);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,2> boolvec<float,2>::as_int() const
-  {
-    return vreinterpret_s32_u32(v);
-  }
-  
-  inline intvec<float,2> boolvec<float,2>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  inline
-  boolvec<float,2> boolvec<float,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vbsl_u32(v, x.v, y.v);
-  }
-  
-  inline intvec<float,2> boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vbsl_s32(v, x.v, y.v);
-  }
-  
-  inline
-  realvec<float,2> boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vbsl_f32(v, x.v, y.v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,2> intvec<float,2>::as_float() const
-  {
-    return vreinterpret_f32_s32(v);
-  }
-  
-  inline realvec<float,2> intvec<float,2>::convert_float() const
-  {
-    return vcvt_f32_s32(v);
-  }
-  
-  inline intvec<float,2> intvec<float,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,2> intvec<float,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return vreinterpret_s32_f32(v); }
+  intvec_t convert_int() const { return vcvt_s32_f32(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vneg_f32(v); }
+
+  realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
+  realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
+  realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
+  realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    realvec r = vpmax_f32(v, v);
+    return r[0];
+  }
+  real_t minval() const {
+    realvec r = vpmin_f32(v, v);
+    return r[0];
+  }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const {
+    realvec r = vpadd_f32(v, v);
+    return r[0];
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vceq_f32(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vclt_f32(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vcle_f32(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vcgt_f32(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vcge_f32(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const {
+    // return vrndp_f32(v);
+    return MF::vml_ceil(*this);
+  }
+  realvec copysign(realvec y) const { // bits in FP::signbit_mask from y, rest from *this
+    return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
+  }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vabs_f32(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const {
+    // return vrndm_f32(v);
+    return MF::vml_floor(*this);
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return vfma_f32(z.v, v, y.v);
+  }
+  realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
+  realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    // TODO: vfma_f32
+    return vmla_f32(z.v, v, y.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const { // approximate 1/v: estimate plus two refinements
+    realvec r = vrecpe_f32(v); // low-precision hardware estimate
+    r *= vrecps_f32(v, r);     // Newton-Raphson step: r *= (2 - v*r)
+    r *= vrecps_f32(v, r);     // second refinement step
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    // return vrndn_f32(v);
+    return MF::vml_rint(*this);
+  }
+  realvec round() const {
+    // return vrnda_f32(v);
+    return MF::vml_round(*this);
+  }
+  realvec rsqrt() const { // approximate 1/sqrt(v): estimate plus two refinements
+    realvec r = vrsqrte_f32(v); // low-precision hardware estimate
+    r *= vrsqrts_f32(v, r * r); // Newton-Raphson step: r *= (3 - v*r*r) / 2
+    r *= vrsqrts_f32(v, r * r); // second refinement step
+    return r;
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return (*this == RV(R(0))).ifthen(*this, *this * rsqrt()); } // x == 0: plain x * rsqrt(x) is 0 * inf = NaN
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const {
+    // return vrnd_f32(v);
+    return MF::vml_trunc(*this);
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 2> boolvec<float, 2>::as_int() const {
+  return vreinterpret_s32_u32(v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::convert_int() const {
+  return -as_int(); // presumably true lanes are all-ones (-1), so negating gives 1 -- confirm
+}
+
+inline boolvec<float, 2> boolvec<float, 2>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vbsl_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vbsl_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 2> boolvec<float, 2>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vbsl_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 2> intvec<float, 2>::as_float() const {
+  return vreinterpret_f32_s32(v);
+}
+
+inline realvec<float, 2> intvec<float, 2>::convert_float() const {
+  return vcvt_f32_s32(v);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_NEON_FLOAT2_H
+#endif // #ifndef VEC_NEON_FLOAT2_H
diff --git a/vec_neon_float4.h b/vec_neon_float4.h
index 2bd9dda..9ec1e79 100644
--- a/vec_neon_float4.h
+++ b/vec_neon_float4.h
@@ -14,628 +14,537 @@
 // Neon intrinsics
 #include <arm_neon.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef uint32x4_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vdupq_n_u32(from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vmvnq_u32(v); }
-    
-    boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
-    boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
-    boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
-    boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
-    
-    bool all() const
-    {
-      uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
-      uint32x2_t y = vpmin_u32(x, x);
-      uint32_t z = vget_lane_u32(y, 0);
-      return to_bool(z);
-    }
-    bool any() const
-    {
-      uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
-      uint32x2_t y = vpmax_u32(x, x);
-      uint32_t z = vget_lane_u32(y, 0);
-      return to_bool(z);
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef int32x4_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vdupq_n_s32(a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      return
-        vcombine_s32(vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
-                     vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
-    boolvec_t convert_bool() const { return *this != IV(0); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vnegq_s32(v); }
-    
-    intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
-    intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
-    intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    
-    
-    
-    intvec operator~() const { return vmvnq_s32(v); }
-    
-    intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
-    intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
-    intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      return vshlq_s32(v, (-n).v);
-    }
-    intvec operator<<(intvec n) const
-    {
-      return vshlq_s32(v, n.v);
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return vclzq_s32(v); }
-    intvec_t popcount() const
-    {
-      return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
-    }
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const { return vceqq_s32(v, x.v); }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const { return vcltq_s32(v, x.v); }
-    boolvec_t operator<=(intvec const& x) const { return vcleq_s32(v, x.v); }
-    boolvec_t operator>(intvec const& x) const { return vcgtq_s32(v, x.v); }
-    boolvec_t operator>=(intvec const& x) const { return vcgeq_s32(v, x.v); }
-    
-    intvec_t abs() const { return vabsq_s32(v); }
-    boolvec_t isignbit() const
-    {
-      //return *this < IV(I(0));
-      return intvec(vshrq_n_s32(v, FP::bits-1)).as_bool();
-    }
-    intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
-    intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef float32x4_t vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<NEON:4*float>"; }
-    void barrier() { __asm__("": "+w"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vdupq_n_f32(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vld1q_f32(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef uint32x4_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vdupq_n_u32(from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vmvnq_u32(v); }
+
+  boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
+  boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
+  boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
+  boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
+
+  bool all() const {
+    uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
+    uint32x2_t y = vpmin_u32(x, x);
+    uint32_t z = vget_lane_u32(y, 0);
+    return to_bool(z);
+  }
+  bool any() const {
+    uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
+    uint32x2_t y = vpmax_u32(x, x);
+    uint32_t z = vget_lane_u32(y, 0);
+    return to_bool(z);
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef int32x4_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vdupq_n_s32(a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    return vcombine_s32(
+        vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
+        vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
+  boolvec_t convert_bool() const { return *this != IV(0); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vnegq_s32(v); }
+
+  intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
+  intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
+  intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+  intvec operator~() const { return vmvnq_s32(v); }
+
+  intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
+  intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
+  intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
+  }
+
+  intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const { return vshlq_s32(v, (-n).v); }
+  intvec operator<<(intvec n) const { return vshlq_s32(v, n.v); }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const { return vclzq_s32(v); }
+  intvec_t popcount() const {
+    return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
+  }
+
+  boolvec_t operator==(intvec const &x) const { return vceqq_s32(v, x.v); }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const { return vcltq_s32(v, x.v); }
+  boolvec_t operator<=(intvec const &x) const { return vcleq_s32(v, x.v); }
+  boolvec_t operator>(intvec const &x) const { return vcgtq_s32(v, x.v); }
+  boolvec_t operator>=(intvec const &x) const { return vcgeq_s32(v, x.v); }
+
+  intvec_t abs() const { return vabsq_s32(v); }
+  boolvec_t isignbit() const {
+    // return *this < IV(I(0));
+    return intvec(vshrq_n_s32(v, FP::bits - 1)).as_bool();
+  }
+  intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
+  intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef float32x4_t vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<NEON:4*float>"; }
+  void barrier() { __asm__("" : "+w"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vdupq_n_f32(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vld1q_f32(p);
+  }
+  static realvec_t loadu(real_t const *p) {
 #if defined __ARM_FEATURE_UNALIGNED
-      return vld1q_f32(p);
+    return vld1q_f32(p);
 #else
-      realvec_t r;
-      r.set_elt(0, p[0]);
-      r.set_elt(1, p[1]);
-      r.set_elt(2, p[2]);
-      r.set_elt(3, p[3]);
-      return r;
+    realvec_t r;
+    r.set_elt(0, p[0]);
+    r.set_elt(1, p[1]);
+    r.set_elt(2, p[2]);
+    r.set_elt(3, p[3]);
+    return r;
 #endif
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vst1q_f32(p, v);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vst1q_f32(p, v);
+  }
+  void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
 #if defined __ARM_FEATURE_UNALIGNED
-      vst1q_f32(p, v);
+    vst1q_f32(p, v);
 #else
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
 #endif
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
-    intvec_t convert_int() const { return vcvtq_s32_f32(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vnegq_f32(v); }
-    
-    realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
-    realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
-    realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
-    realvec operator/(realvec x) const { return *this * x.rcp(); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpmax_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    real_t minval() const
-    {
-      float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpmin_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    real_t prod() const
-    {
-      // TODO: multiply pairwise with 2-vectors
-      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-    }
-    real_t sum() const
-    {
-      float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
-      float32x2_t y = vpadd_f32(x, x);
-      float32_t z = vget_lane_f32(y, 0);
-      return z;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vceqq_f32(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vcltq_f32(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vcleq_f32(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vcgtq_f32(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vcgeq_f32(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const
-    {
-      // return vrndpq_f32(v);
-      return MF::vml_ceil(*this);
-    }
-    realvec copysign(realvec y) const
-    {
-      return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
-    }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vabsq_f32(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const
-    {
-      // return vrndmq_f32(v);
-      return MF::vml_floor(*this);
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return vfmaq_f32(z.v, v, y.v);
-    }
-    realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
-    realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return vmlaq_f32(z.v, v, y.v);
-    }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec r = vrecpeq_f32(v);
-      r *= vrecpsq_f32(v, r);
-      r *= vrecpsq_f32(v, r);
-      return r;
-    }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const
-    {
-      // return vrndnq_f32(v);
-      return MF::vml_rint(*this);
-    }
-    realvec round() const
-    {
-      // return vrndaq_f32(v);
-      return MF::vml_round(*this);
-    }
-    realvec rsqrt() const
-    {
-      realvec r = vrsqrteq_f32(v);
-      r *= vrsqrtsq_f32(v, r*r);
-      r *= vrsqrtsq_f32(v, r*r);
-      return r;
-    }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return *this * rsqrt(); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const
-    {
-      // return vrndq_f32(v);
-      return MF::vml_trunc(*this);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return vreinterpretq_s32_u32(v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return - as_int();
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vbslq_u32(v, x.v, y.v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vbslq_s32(v, x.v, y.v);
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vbslq_f32(v, x.v, y.v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return vreinterpretq_f32_s32(v);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
-    return vcvtq_f32_s32(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
+  intvec_t convert_int() const { return vcvtq_s32_f32(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vnegq_f32(v); }
+
+  realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
+  realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
+  realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
+  realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpmax_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+  real_t minval() const {
+    float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpmin_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+  real_t prod() const {
+    // TODO: multiply pairwise with 2-vectors
+    return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+  }
+  real_t sum() const {
+    float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
+    float32x2_t y = vpadd_f32(x, x);
+    float32_t z = vget_lane_f32(y, 0);
+    return z;
+  }
+
+  boolvec_t operator==(realvec const &x) const { return vceqq_f32(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vcltq_f32(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vcleq_f32(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vcgtq_f32(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vcgeq_f32(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const {
+    // return vrndpq_f32(v);
+    return MF::vml_ceil(*this);
+  }
+  realvec copysign(realvec y) const {
+    return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
+  }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vabsq_f32(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const {
+    // return vrndmq_f32(v);
+    return MF::vml_floor(*this);
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return vfmaq_f32(z.v, v, y.v);
+  }
+  realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
+  realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return vmlaq_f32(z.v, v, y.v);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec r = vrecpeq_f32(v);
+    r *= vrecpsq_f32(v, r);
+    r *= vrecpsq_f32(v, r);
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    // return vrndnq_f32(v);
+    return MF::vml_rint(*this);
+  }
+  realvec round() const {
+    // return vrndaq_f32(v);
+    return MF::vml_round(*this);
+  }
+  realvec rsqrt() const {
+    realvec r = vrsqrteq_f32(v);
+    r *= vrsqrtsq_f32(v, r * r);
+    r *= vrsqrtsq_f32(v, r * r);
+    return r;
+  }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return *this * rsqrt(); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const {
+    // return vrndq_f32(v);
+    return MF::vml_trunc(*this);
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+  return vreinterpretq_s32_u32(v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+  return -as_int();
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return vbslq_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return vbslq_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return vbslq_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+  return vreinterpretq_f32_s32(v);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
+  return vcvtq_f32_s32(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_NEON_FLOAT4_H
+#endif // #ifndef VEC_NEON_FLOAT4_H
diff --git a/vec_pseudo.h b/vec_pseudo.h
index 2aafc23..c4cbbc1 100644
--- a/vec_pseudo.h
+++ b/vec_pseudo.h
@@ -12,1668 +12,1492 @@
 #include <climits>
 #include <cstdlib>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 #include <string>
 
+namespace vecmathlib {
 
+template <typename T, int N> struct boolpseudovec;
+template <typename T, int N> struct intpseudovec;
+template <typename T, int N> struct realpseudovec;
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct boolpseudovec;
-  template<typename T, int N> struct intpseudovec;
-  template<typename T, int N> struct realpseudovec;
-  
-  
-  
-  template<typename T, int N>
-  struct boolpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef bool scalar_t;
-    typedef bool bvector_t[size];
-    static int const alignment = sizeof(bool);
-    
-    typedef boolpseudovec boolvec_t;
-    typedef intpseudovec<real_t, size> intvec_t;
-    typedef realpseudovec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolpseudovec(boolpseudovec const& x): v(x.v) {}
-    // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
-    boolpseudovec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
-    boolpseudovec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    bool operator[](int n) const { return v[n]; }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intpseudovec
-    intvec_t convert_int() const; // defined after intpseudovec
-    
-    
-    
-    boolvec_t operator!() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = !v[d];
-      return res;
-    }
-    
-    boolvec_t operator&&(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
-      return res;
-    }
-    boolvec_t operator||(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
-      return res;
-    }
-    boolvec_t operator==(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    
-    bool all() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res && v[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res || v[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realpseudovec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct intpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t[size];
-    static int const alignment = sizeof(int_t);
-    
-    typedef boolpseudovec<real_t, size> boolvec_t;
-    typedef intpseudovec intvec_t;
-    typedef realpseudovec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intpseudovec(intpseudovec const& x): v(x.v) {}
-    // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
-    intpseudovec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    intpseudovec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=d;
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=v[d];
-      return res;
-    }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d];
-      return res;
-    }
-    realvec_t as_float() const;      // defined after realpseudovec
-    realvec_t convert_float() const; // defined after realpseudovec
-    
-    
-    
-    intvec_t operator+() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    intvec_t operator-() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    intvec_t& operator+=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    intvec_t& operator-=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    intvec_t& operator*=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    intvec_t& operator/=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    intvec_t& operator%=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] %= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res += x;
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res -= x;
-    }
-    intvec_t operator*(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res *= x;
-    }
-    intvec_t operator/(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res /= x;
-    }
-    intvec_t operator%(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res %= x;
-    }
-    
-    
-    
-    intvec_t operator~() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
-      return res;
-    }
-    
-    intvec_t& operator&=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] &= x.v[d];
-      return *this;
-    }
-    intvec_t& operator|=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] |= x.v[d];
-      return *this;
-    }
-    intvec_t& operator^=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] ^= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res &= x;
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res |= x;
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res ^= x;
-    }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
-      return res;
-    }
-    intvec_t rotate(int_t n) const;
-    intvec_t& operator>>=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n;
-      return *this;
-    }
-    intvec_t& operator<<=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n;
-      return *this;
-    }
-    intvec_t operator>>(int_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
-      return res;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t& operator>>=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n.v[d];
-      return *this;
-    }
-    intvec_t& operator<<=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n.v[d];
-      return *this;
-    }
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t clz() const
-    {
-      intvec_t res;
+template <typename T, int N> struct boolpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef bool scalar_t;
+  typedef bool bvector_t[size];
+  static int const alignment = sizeof(bool);
+
+  typedef boolpseudovec boolvec_t;
+  typedef intpseudovec<real_t, size> intvec_t;
+  typedef realpseudovec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolpseudovec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolpseudovec(boolpseudovec const& x): v(x.v) {}
+  // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
+  boolpseudovec(bool a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  boolpseudovec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  bool operator[](int n) const { return v[n]; }
+  boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; }
+
+  intvec_t as_int() const;      // defined after intpseudovec
+  intvec_t convert_int() const; // defined after intpseudovec
+
+  boolvec_t operator!() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = !v[d];
+    return res;
+  }
+
+  boolvec_t operator&&(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] && x.v[d];
+    return res;
+  }
+  boolvec_t operator||(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] || x.v[d];
+    return res;
+  }
+  boolvec_t operator==(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(boolvec_t x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+
+  bool all() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res && v[d];
+    return res;
+  }
+  bool any() const {
+    bool res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = res || v[d];
+    return res;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
+  realvec_t ifthen(realvec_t x,
+                   realvec_t y) const; // defined after realpseudovec
+};
+
+template <typename T, int N> struct intpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t[size];
+  static int const alignment = sizeof(int_t);
+
+  typedef boolpseudovec<real_t, size> boolvec_t;
+  typedef intpseudovec intvec_t;
+  typedef realpseudovec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intpseudovec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intpseudovec(intpseudovec const& x): v(x.v) {}
+  // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
+  intpseudovec(int_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  intpseudovec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+  static intvec_t iota() {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = d;
+    return res;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  realvec_t as_float() const;      // defined after realpseudovec
+  realvec_t convert_float() const; // defined after realpseudovec
+
+  intvec_t operator+() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
+    return res;
+  }
+  intvec_t operator-() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
+    return res;
+  }
+
+  intvec_t &operator+=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  intvec_t &operator-=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  intvec_t &operator*=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  intvec_t &operator/=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
+  }
+  intvec_t &operator%=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] %= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator+(intvec_t x) const {
+    intvec_t res = *this;
+    return res += x;
+  }
+  intvec_t operator-(intvec_t x) const {
+    intvec_t res = *this;
+    return res -= x;
+  }
+  intvec_t operator*(intvec_t x) const {
+    intvec_t res = *this;
+    return res *= x;
+  }
+  intvec_t operator/(intvec_t x) const {
+    intvec_t res = *this;
+    return res /= x;
+  }
+  intvec_t operator%(intvec_t x) const {
+    intvec_t res = *this;
+    return res %= x;
+  }
+
+  intvec_t operator~() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = ~v[d];
+    return res;
+  }
+
+  intvec_t &operator&=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] &= x.v[d];
+    return *this;
+  }
+  intvec_t &operator|=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] |= x.v[d];
+    return *this;
+  }
+  intvec_t &operator^=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] ^= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator&(intvec_t x) const {
+    intvec_t res = *this;
+    return res &= x;
+  }
+  intvec_t operator|(intvec_t x) const {
+    intvec_t res = *this;
+    return res |= x;
+  }
+  intvec_t operator^(intvec_t x) const {
+    intvec_t res = *this;
+    return res ^= x;
+  }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n));
+    return res;
+  }
+  intvec_t rotate(int_t n) const;
+  intvec_t &operator>>=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n;
+    return *this;
+  }
+  intvec_t &operator<<=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n;
+    return *this;
+  }
+  intvec_t operator>>(int_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(int_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n.v[d]));
+    return res;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t &operator>>=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n.v[d];
+    return *this;
+  }
+  intvec_t &operator<<=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n.v[d];
+    return *this;
+  }
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t clz() const {
+    intvec_t res;
 #if defined __clang__ || defined __gcc__
-      for (int d=0; d<size; ++d) {
-        if (v[d] == 0) {
-          res.v[d] = CHAR_BIT * sizeof v[d];
+    for (int d = 0; d < size; ++d) {
+      if (v[d] == 0) {
+        res.v[d] = CHAR_BIT * sizeof v[d];
+      } else {
+        if (sizeof v[d] == sizeof(long long)) {
+          res.v[d] = __builtin_clzll(v[d]);
+        } else if (sizeof v[d] == sizeof(long)) {
+          res.v[d] = __builtin_clzl(v[d]);
+        } else if (sizeof v[d] == sizeof(int)) {
+          res.v[d] = __builtin_clz(v[d]);
+        } else if (sizeof v[d] == sizeof(short)) {
+          res.v[d] = __builtin_clzs(v[d]);
+        } else if (sizeof v[d] == sizeof(char)) {
+          res.v[d] = __builtin_clzs((unsigned short)(unsigned char)v[d]) -
+                     CHAR_BIT * (sizeof(short) - sizeof(char));
         } else {
-          if (sizeof v[d] == sizeof(long long)) {
-            res.v[d] = __builtin_clzll(v[d]);
-          } else if (sizeof v[d] == sizeof(long)) {
-            res.v[d] = __builtin_clzl(v[d]);
-          } else if (sizeof v[d] == sizeof(int)) {
-            res.v[d] = __builtin_clz(v[d]);
-          } else if (sizeof v[d] == sizeof(short)) {
-            res.v[d] = __builtin_clzs(v[d]);
-          } else if (sizeof v[d] == sizeof(char)) {
-            res.v[d] =
-              __builtin_clzs((unsigned short)(unsigned char)v[d]) -
-              CHAR_BIT * (sizeof(short) - sizeof(char));
-          } else {
-            __builtin_unreachable();
-          }
+          __builtin_unreachable();
         }
       }
+    }
 #else
-      res = MF::vml_clz(*this);
+    res = MF::vml_clz(*this);
 #endif
-      return res;
-    }
-    intvec_t popcount() const
-    {
-      intvec_t res;
+    return res;
+  }
+  intvec_t popcount() const {
+    intvec_t res;
 #if defined __clang__ || defined __gcc__
-      if (sizeof(int_t) == sizeof(long long)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountll(v[d]);
-      } else if (sizeof(int_t) == sizeof(long)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountl(v[d]);
-      } else if (sizeof(int_t) <= sizeof(int)) {
-        for (int d=0; d<size; ++d) res.v[d] = __builtin_popcount(v[d]);
-      } else {
-        __builtin_unreachable();
-      }
+    if (sizeof(int_t) == sizeof(long long)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcountll(v[d]);
+    } else if (sizeof(int_t) == sizeof(long)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcountl(v[d]);
+    } else if (sizeof(int_t) <= sizeof(int)) {
+      for (int d = 0; d < size; ++d)
+        res.v[d] = __builtin_popcount(v[d]);
+    } else {
+      __builtin_unreachable();
+    }
 #else
-      res = MF::vml_popcount(*this);
+    res = MF::vml_popcount(*this);
 #endif
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    intvec_t abs() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::abs(v[d]);
-      return res;
-    }
-    
-    boolvec_t isignbit() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < 0;
-      return res;
-    }
-    
-    intvec_t max(intvec_t x) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::max(v[d], x.v[d]);
-      return res;
-    }
-    
-    intvec_t min(intvec_t x) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = std::min(v[d], x.v[d]);
-      return res;
-    }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realpseudovec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t[size];
-    static int const alignment = sizeof(real_t);
-    
+    return res;
+  }
+
+  boolvec_t operator==(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  intvec_t abs() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::abs(v[d]);
+    return res;
+  }
+
+  boolvec_t isignbit() const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < 0;
+    return res;
+  }
+
+  intvec_t max(intvec_t x) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::max(v[d], x.v[d]);
+    return res;
+  }
+
+  intvec_t min(intvec_t x) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = std::min(v[d], x.v[d]);
+    return res;
+  }
+};
+
+template <typename T, int N> struct realpseudovec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t[size];
+  static int const alignment = sizeof(real_t);
+
 #ifndef VML_NO_IOSTREAM
-    static char const* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<libm:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static char const *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<libm:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier()
-    {
+  void barrier() {
 #if defined __GNUC__ && !defined __clang__ && !defined __ICC
-      // GCC crashes when +X is used as constraint
-#  if defined __SSE2__
-      for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-#  elif defined __PPC64__       // maybe also __PPC__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-#  elif defined __arm__
-      for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-#  else
-#    error "Floating point barrier undefined on this architecture"
-#  endif
+// GCC crashes when +X is used as constraint
+#if defined __SSE2__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+x"(v[d]));
+#elif defined __PPC64__ // maybe also __PPC__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
 #elif defined __clang__
-      for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+X"(v[d]));
 #elif defined __ICC
-      for (int d=0; d<size; ++d) {
-        real_t tmp = v[d];
-        __asm__("": "+X"(tmp));
-        v[d] = tmp;
-      }
+    for (int d = 0; d < size; ++d) {
+      real_t tmp = v[d];
+      __asm__("" : "+X"(tmp));
+      v[d] = tmp;
+    }
 #elif defined __IBMCPP__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
 #else
-#  error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
 #endif
-    }
-    
-    typedef boolpseudovec<real_t, size> boolvec_t;
-    typedef intpseudovec<real_t, size> intvec_t;
-    typedef realpseudovec realvec_t;
-    
-  private:
-    boolvec_t mapb(bool f(real_t)) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    intvec_t map(int_t f(real_t)) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t)) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, int_t), intvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t), realvec_t x) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
-      return res;
-    }
-    realvec_t map(real_t f(real_t, real_t, real_t),
-                  realvec_t x, realvec_t y) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
-      return res;
-    }
-  public:
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realpseudovec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realpseudovec(realpseudovec const& x): v(x.v) {}
-    // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
-    realpseudovec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    realpseudovec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = p[d];
-      return res;
-    }
-    static realvec_t loadu(real_t const* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p);
-    }
-    void storeu(real_t* p) const
-    {
-      for (int d=0; d<size; ++d) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
-      return res;
-    }
-    intvec_t convert_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::convert_int(v[d]);
-      return res;
-    }
-    
-    
-    
-    realvec_t operator+() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    realvec_t operator-() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    realvec_t& operator+=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    realvec_t& operator-=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    realvec_t& operator*=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    realvec_t& operator/=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    
-    realvec_t operator+(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res += x;
-    }
-    realvec_t operator-(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res -= x;
-    }
-    realvec_t operator*(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res *= x;
-    }
-    realvec_t operator/(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res /= x;
-    }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res *= v[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res += v[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    
-    
-    realvec_t acos() const { return map(vml_std::acos); }
-    realvec_t acosh() const { return map(vml_std::acosh); }
-    realvec_t asin() const { return map(vml_std::asin); }
-    realvec_t asinh() const { return map(vml_std::asinh); }
-    realvec_t atan() const { return map(vml_std::atan); }
-    realvec_t atan2(realvec_t y) const
-    {
-      return MF::vml_atan2(*this, y);
-    }
-    realvec_t atanh() const { return map(vml_std::atanh); }
-    realvec_t cbrt() const { return map(vml_std::cbrt); }
-    realvec_t ceil() const { return map(vml_std::ceil); }
-    realvec_t copysign(realvec_t y) const
-    {
-      return map(vml_std::copysign, y);
-    }
-    realvec_t cos() const { return map(vml_std::cos); }
-    realvec_t cosh() const { return map(vml_std::cosh); }
-    realvec_t exp() const { return map(vml_std::exp); }
-    realvec_t exp10() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::exp(R(M_LN10) * v[d]);
-      return res;
-    }
-    realvec_t exp2() const { return map(vml_std::exp2); }
-    realvec_t expm1() const { return map(vml_std::expm1); }
-    realvec_t fabs() const { return map(vml_std::fabs); }
-    realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); }
-    realvec_t floor() const { return map(vml_std::floor); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return map(vml_std::fma, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); }
-    realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
-    realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); }
-    realvec_t frexp(intvec_t* ires) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) {
-        int iri;
-        real_t r = vml_std::frexp(v[d], &iri);
-        int_t ir = iri;
+  }
+
+  typedef boolpseudovec<real_t, size> boolvec_t; // boolean vector with the same real_t/size parameters
+  typedef intpseudovec<real_t, size> intvec_t; // integer vector with the same real_t/size parameters
+  typedef realpseudovec realvec_t; // this class itself
+
+private:
+  boolvec_t mapb(bool f(real_t)) const { // apply the scalar predicate f to every element
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]); // result element d is f of element d
+    return res;
+  }
+  intvec_t map(int_t f(real_t)) const { // elementwise map whose scalar function yields int_t
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t)) const { // elementwise map of a unary scalar function
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d]);
+    return res;
+  }
+  realvec_t map(real_t f(real_t, int_t), intvec_t x) const { // elementwise map of a (real, int) scalar function
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]); // pairs element d of *this with element d of x
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t), realvec_t x) const { // elementwise map of a binary scalar function
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d]); // *this supplies the first argument, x the second
+    return res;
+  }
+  realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x, // elementwise map of a ternary scalar function
+                realvec_t y) const {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = f(v[d], x.v[d], y.v[d]); // argument order: *this, x, y
+    return res;
+  }
+
+public:
+  // Short names for type casts
+  typedef real_t R; // scalar element type
+  typedef int_t I; // scalar integer type
+  typedef uint_t U; // alias for uint_t
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP; // supplies as_int / convert_int used by the conversions below
+  typedef mathfuncs<realvec_t> MF; // supplies vml_atan2 / vml_mad used by the math members below
+
+  vector_t v; // element storage; the loops in this class index it from 0 to size-1
+
+  realpseudovec() {} // performs no explicit initialization of v
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realpseudovec(realpseudovec const& x): v(x.v) {}
+  // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
+  realpseudovec(real_t a) { // broadcast: every element is set to a (also usable as an implicit conversion)
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  realpseudovec(real_t const *as) { // copy elements from as[0] .. as[size-1]
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d]; // the caller must supply at least size readable values
+  }
+
+  real_t operator[](int n) const { return v[n]; } // read element n by value; n is not range-checked here
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } // write element n, then return *this
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t; // mask type taken by the masked load/store overloads below
+
+  static realvec_t loada(real_t const *p) { // load size elements from an address expected to be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of alignment
+    return loadu(p); // the copy itself is shared with the unaligned path
+  }
+  static realvec_t loadu(real_t const *p) { // load size elements from p with no alignment requirement
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = p[d];
+    return res;
+  }
+  static realvec_t loadu(real_t const *p, size_t ioff) { // load size elements starting at p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer p is checked, not p + ioff
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const { // masked aligned load merged with *this
+    return m.m.ifthen(loada(p), *this); // the full load runs first; m.m.ifthen then picks between it and *this (presumably per element -- confirm in mask_t)
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const { // masked unaligned load merged with *this
+    return m.m.ifthen(loadu(p), *this); // all size elements at p are read regardless of the mask
+  }
+  realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const { // masked load from p + ioff merged with *this
+    return m.m.ifthen(loadu(p, ioff), *this); // the inner loadu asserts alignment of the base p
+  }
+
+  void storea(real_t *p) const { // store all elements to an address expected to be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of alignment
+    storeu(p);
+  }
+  void storeu(real_t *p) const { // store all size elements to p[0] .. p[size-1]
+    for (int d = 0; d < size; ++d)
+      p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff) const { // store all elements starting at p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer p is checked, not p + ioff
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const { // masked store to an address expected to be aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m); // the masked element loop lives in the unaligned overload
+  }
+  void storeu(real_t *p, mask_t const &m) const { // masked store: writes only the selected elements
+    for (int d = 0; d < size; ++d)
+      if (m.m[d]) // elements whose mask entry is false leave p[d] untouched
+        p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff, mask_t const &m) const { // masked store starting at p + ioff
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer p is checked, not p + ioff
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { // per-element FP::as_int (presumably a bit-level reinterpretation -- confirm in floatprops)
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = FP::as_int(v[d]);
+    return res;
+  }
+  intvec_t convert_int() const { // per-element FP::convert_int (presumably a value conversion -- confirm in floatprops)
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = FP::convert_int(v[d]);
+    return res;
+  }
+
+  realvec_t operator+() const { // unary plus applied to each element
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
+    return res;
+  }
+  realvec_t operator-() const { // elementwise negation
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
+    return res;
+  }
+
+  realvec_t &operator+=(realvec_t const &x) { // in-place elementwise addition
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  realvec_t &operator-=(realvec_t const &x) { // in-place elementwise subtraction
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  realvec_t &operator*=(realvec_t const &x) { // in-place elementwise multiplication
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  realvec_t &operator/=(realvec_t const &x) { // in-place elementwise division; divisors are not checked for zero
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
+  }
+
+  realvec_t operator+(realvec_t x) const { // elementwise sum, built on operator+=
+    realvec_t res = *this; // work on a copy so *this is unchanged
+    return res += x;
+  }
+  realvec_t operator-(realvec_t x) const { // elementwise difference, built on operator-=
+    realvec_t res = *this; // work on a copy so *this is unchanged
+    return res -= x;
+  }
+  realvec_t operator*(realvec_t x) const { // elementwise product, built on operator*=
+    realvec_t res = *this; // work on a copy so *this is unchanged
+    return res *= x;
+  }
+  realvec_t operator/(realvec_t x) const { // elementwise quotient, built on operator/=
+    realvec_t res = *this; // work on a copy so *this is unchanged
+    return res /= x;
+  }
+
+  real_t maxval() const { // horizontal reduction of all elements with vml_std::fmax
+    real_t res = v[0]; // seeded from element 0, which is read unconditionally
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmax(res, v[d]);
+    return res;
+  }
+  real_t minval() const { // horizontal reduction of all elements with vml_std::fmin
+    real_t res = v[0]; // seeded from element 0, which is read unconditionally
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmin(res, v[d]);
+    return res;
+  }
+  real_t prod() const { // product of all elements, multiplied in index order
+    real_t res = v[0]; // seeded from element 0, which is read unconditionally
+    for (int d = 1; d < size; ++d)
+      res *= v[d];
+    return res;
+  }
+  real_t sum() const { // sum of all elements, added in index order
+    real_t res = v[0]; // seeded from element 0, which is read unconditionally
+    for (int d = 1; d < size; ++d)
+      res += v[d];
+    return res;
+  }
+
+  boolvec_t operator==(realvec_t const &x) const { // elementwise ==; yields a boolean vector, not a single bool
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(realvec_t const &x) const { // elementwise !=; yields a boolean vector
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(realvec_t const &x) const { // elementwise <; yields a boolean vector
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(realvec_t const &x) const { // elementwise <=; yields a boolean vector
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(realvec_t const &x) const { // elementwise >; yields a boolean vector
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(realvec_t const &x) const { // elementwise >=; yields a boolean vector
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  realvec_t acos() const { return map(vml_std::acos); } // this and most neighbours apply the scalar vml_std function per element via map
+  realvec_t acosh() const { return map(vml_std::acosh); }
+  realvec_t asin() const { return map(vml_std::asin); }
+  realvec_t asinh() const { return map(vml_std::asinh); }
+  realvec_t atan() const { return map(vml_std::atan); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // exception: delegates to the generic MF routine, with *this as first argument
+  realvec_t atanh() const { return map(vml_std::atanh); }
+  realvec_t cbrt() const { return map(vml_std::cbrt); }
+  realvec_t ceil() const { return map(vml_std::ceil); }
+  realvec_t copysign(realvec_t y) const { return map(vml_std::copysign, y); } // *this is the first argument, y the second
+  realvec_t cos() const { return map(vml_std::cos); }
+  realvec_t cosh() const { return map(vml_std::cosh); }
+  realvec_t exp() const { return map(vml_std::exp); }
+  realvec_t exp10() const { // per-element 10^x, evaluated through vml_std::exp
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = vml_std::exp(R(M_LN10) * v[d]); // exp(M_LN10 * x); M_LN10 is cast to real_t first
+    return res;
+  }
+  realvec_t exp2() const { return map(vml_std::exp2); } // these wrappers apply the scalar vml_std function per element via map
+  realvec_t expm1() const { return map(vml_std::expm1); }
+  realvec_t fabs() const { return map(vml_std::fabs); }
+  realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); }
+  realvec_t floor() const { return map(vml_std::floor); }
+  realvec_t fma(realvec_t y, realvec_t z) const { // per-element vml_std::fma with arguments (*this, y, z)
+    return map(vml_std::fma, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); }
+  realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
+  realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); } // *this is the first argument, y the second
+  realvec_t frexp(intvec_t *ires) const { // returns the per-element vml_std::frexp results; exponents are written to *ires
+    realvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int iri; // vml_std::frexp is handed an int*, hence the separate int temporary
+      real_t r = vml_std::frexp(v[d], &iri);
+      int_t ir = iri; // convert the exponent to the vector's integer element type
 #if defined VML_HAVE_INF
-        if (vml_std::isinf(v[d])) ir = std::numeric_limits<int_t>::max();
+      if (vml_std::isinf(v[d])) // infinite input: exponent is forced to the int_t maximum
+        ir = std::numeric_limits<int_t>::max();
 #endif
 #if defined VML_HAVE_NAN
-        if (vml_std::isnan(v[d])) ir = std::numeric_limits<int_t>::min();
+      if (vml_std::isnan(v[d])) // NaN input: exponent is forced to the int_t minimum
+        ir = std::numeric_limits<int_t>::min();
 #endif
-        res.v[d] = r;
-        ires->v[d] = ir;
-      }
-      return res;
+      res.v[d] = r;
+      ires->v[d] = ir; // ires is dereferenced without a null check
     }
-    realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); }
-    intvec_t ilogb() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) {
-        int_t r = vml_std::ilogb(v[d]);
-        typedef std::numeric_limits<int_t> NL;
-        if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) {
-          r = NL::min();
+    return res;
+  }
+  realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); } // per-element vml_std::hypot of (*this, y)
+  intvec_t ilogb() const { // per-element vml_std::ilogb, with special inputs mapped onto int_t limits
+    intvec_t res;
+    for (int d = 0; d < size; ++d) {
+      int_t r = vml_std::ilogb(v[d]);
+      typedef std::numeric_limits<int_t> NL;
+      if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) { // zero input: report NL::min() when FP_ILOGB0 differs from it
+        r = NL::min();
 #if defined VML_HAVE_INF
-        } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) {
-          r = NL::max();
+      } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) { // infinite input: report NL::max() when INT_MAX differs from it
+        r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-        } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) {
-          r = NL::min();
+      } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) { // NaN input: report NL::min() when FP_ILOGBNAN differs from it
+        r = NL::min();
 #endif
-        }
-        res.v[d] = r;
       }
-      return res;
+      res.v[d] = r;
     }
-    boolvec_t isfinite() const { return mapb(vml_std::isfinite); }
-    boolvec_t isinf() const { return mapb(vml_std::isinf); }
-    boolvec_t isnan() const { return mapb(vml_std::isnan); }
-    boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
-    realvec_t ldexp(int_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n);
-      return res;
-    }
-    realvec_t ldexp(intvec_t n) const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n.v[d]);
-      return res;
-    }
-    realvec_t log() const { return map(vml_std::log); }
-    realvec_t log10() const { return map(vml_std::log10); }
-    realvec_t log1p() const { return map(vml_std::log1p); }
-    realvec_t log2() const { return map(vml_std::log2); }
-    intvec_t lrint() const
-    {
-      realvec_t res;
-      if (sizeof(int_t) <= sizeof(long)) {
-        for (int d=0; d<size; ++d) res.v[d] = vml_std::lrint(v[d]);
-      } else if (sizeof(int_t) <= sizeof(long long)) {
-        for (int d=0; d<size; ++d) res.v[d] = vml_std::llrint(v[d]);
-      } else {
-        __builtin_unreachable();
-      }
-      return res;
-    }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return map(vml_std::nextafter, y);
-    }
-    realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); }
-    realvec_t rcp() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = R(1.0) / v[d];
-      return res;
-    }
-    realvec_t remainder(realvec_t y) const
-    {
-      return map(vml_std::remainder, y);
-    }
-    realvec_t rint() const { return map(vml_std::rint); }
-    realvec_t round() const { return map(vml_std::round); }
-    realvec_t rsqrt() const { return sqrt().rcp(); }
-    boolvec_t signbit() const { return mapb(vml_std::signbit); }
-    realvec_t sin() const { return map(vml_std::sin); }
-    realvec_t sinh() const { return map(vml_std::sinh); }
-    realvec_t sqrt() const { return map(vml_std::sqrt); }
-    realvec_t tan() const { return map(vml_std::tan); }
-    realvec_t tanh() const { return map(vml_std::tanh); }
-    realvec_t trunc() const { return map(vml_std::trunc); }
-  };
-  
-  
-  
-  // boolpseudovec definitions
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::as_int() const
-  {
-    return convert_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::convert_int() const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::boolvec_t
-  boolpseudovec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    boolvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
-    return res;
-  }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::intvec_t
-  boolpseudovec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+  boolvec_t isfinite() const { return mapb(vml_std::isfinite); } // these four apply the scalar vml_std classifier per element via mapb
+  boolvec_t isinf() const { return mapb(vml_std::isinf); }
+  boolvec_t isnan() const { return mapb(vml_std::isnan); }
+  boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
+  realvec_t ldexp(int_t n) const { // per-element vml_std::ldexp with the same scalar n for every element
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = vml_std::ldexp(v[d], n);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename boolpseudovec<T,N>::realvec_t
-  boolpseudovec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
+  realvec_t ldexp(intvec_t n) const { // per-element vml_std::ldexp with a separate n.v[d] for each element
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = vml_std::ldexp(v[d], n.v[d]);
     return res;
   }
-  
-  
-  
-  // intpseudovec definitions
-  
-  template<typename T, int N>
-  inline
-  typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::as_float() const
-  {
+  realvec_t log() const { return map(vml_std::log); } // these four apply the scalar vml_std logarithm per element via map
+  realvec_t log10() const { return map(vml_std::log10); }
+  realvec_t log1p() const { return map(vml_std::log1p); }
+  realvec_t log2() const { return map(vml_std::log2); }
+  intvec_t lrint() const { // round each element to an integer via vml_std::lrint / vml_std::llrint
-    realvec_t res;
+    intvec_t res; // integer accumulator: it must match the declared intvec_t return type (was realvec_t)
-    for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+    if (sizeof(int_t) <= sizeof(long)) { // long is at least as wide as int_t, so lrint suffices
+      for (int d = 0; d < size; ++d)
+        res.v[d] = vml_std::lrint(v[d]);
+    } else if (sizeof(int_t) <= sizeof(long long)) { // otherwise fall back to the long long variant
+      for (int d = 0; d < size; ++d)
+        res.v[d] = vml_std::llrint(v[d]);
+    } else {
+      __builtin_unreachable(); // int_t wider than long long is treated as impossible
+    }
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  intpseudovec<T,N> intpseudovec<T,N>::bitifthen(intvec_t x, intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  template<typename T, int N>
-  inline
-  typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::convert_float() const
-  {
+  realvec_t mad(realvec_t y, realvec_t z) const { // delegates to the generic MF::vml_mad with arguments (*this, y, z)
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return map(vml_std::nextafter, y); } // per-element vml_std::nextafter of (*this, y)
+  realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); } // per-element vml_std::pow with *this as base and y as exponent
+  realvec_t rcp() const { // per-element reciprocal computed by plain division
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = FP::convert_float(v[d]);
+    for (int d = 0; d < size; ++d)
+      res.v[d] = R(1.0) / v[d]; // divisor is not checked for zero
     return res;
   }
-  
-  template<typename T, int N>
-  inline intpseudovec<T,N> intpseudovec<T,N>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  template<typename T, int N>
-  inline intpseudovec<T,N> intpseudovec<T,N>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-
-
-  // Wrappers
-  
-  // boolpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(boolpseudovec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(boolpseudovec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                     boolpseudovec<real_t, size> x,
-                                     boolpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                    intpseudovec<real_t, size> x,
-                                    intpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
-                                     realpseudovec<real_t, size> x,
-                                     realpseudovec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // intpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
-                                              intpseudovec<real_t, size> y,
-                                              intpseudovec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
-                                 typename intpseudovec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
-                                        intpseudovec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
-                                    typename
-                                    intpseudovec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
-                                           intpseudovec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realpseudovec wrappers
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loada(real_t const* p,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loadu(real_t const* p,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realpseudovec<real_t, size> x,
-        typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realpseudovec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realpseudovec<real_t, size> x, real_t* p,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realpseudovec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realpseudovec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realpseudovec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realpseudovec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realpseudovec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
-                                           realpseudovec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
-                                              realpseudovec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y,
-                                         realpseudovec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
-                                          realpseudovec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
-                                           intpseudovec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
-                                           realpseudovec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
-                                    typename intpseudovec<real_t, size>::int_t
-                                    n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
-                                    intpseudovec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y,
-                                         realpseudovec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
-                                               realpseudovec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
-                                         realpseudovec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
-                                               realpseudovec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
+  realvec_t remainder(realvec_t y) const { return map(vml_std::remainder, y); }
+  realvec_t rint() const { return map(vml_std::rint); }
+  realvec_t round() const { return map(vml_std::round); }
+  realvec_t rsqrt() const { return sqrt().rcp(); }
+  boolvec_t signbit() const { return mapb(vml_std::signbit); }
+  realvec_t sin() const { return map(vml_std::sin); }
+  realvec_t sinh() const { return map(vml_std::sinh); }
+  realvec_t sqrt() const { return map(vml_std::sqrt); }
+  realvec_t tan() const { return map(vml_std::tan); }
+  realvec_t tanh() const { return map(vml_std::tanh); }
+  realvec_t trunc() const { return map(vml_std::trunc); }
+};
+
+// boolpseudovec definitions
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::as_int() const {
+  return convert_int();
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::convert_int() const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::boolvec_t
+boolpseudovec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  boolvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::realvec_t
+boolpseudovec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+// intpseudovec definitions
+
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::as_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = FP::as_float(v[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::bitifthen(intvec_t x,
+                                                        intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::convert_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = FP::convert_float(v[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+// Wrappers
+
+// boolpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(boolpseudovec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(boolpseudovec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                          boolpseudovec<real_t, size> x,
+                                          boolpseudovec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                         intpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+                                          realpseudovec<real_t, size> x,
+                                          realpseudovec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// intpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
+                                            intpseudovec<real_t, size> y,
+                                            intpseudovec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+lsr(intpseudovec<real_t, size> x,
+    typename intpseudovec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
+                                      intpseudovec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+rotate(intpseudovec<real_t, size> x,
+       typename intpseudovec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realpseudovec wrappers
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loada(real_t const *p, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, size_t ioff, realpseudovec<real_t, size> x,
+      typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realpseudovec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realpseudovec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realpseudovec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realpseudovec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realpseudovec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
+                                         realpseudovec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
+                                            realpseudovec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y,
+                                       realpseudovec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
+                                        realpseudovec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
+                                         realpseudovec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+ldexp(realpseudovec<real_t, size> x,
+      typename intpseudovec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
+                                         intpseudovec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y,
+                                       realpseudovec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
+                                             realpseudovec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
+                                       realpseudovec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
+                                             realpseudovec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x) {
+  return x.trunc();
+}
+
 #ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           boolpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           intpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realpseudovec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         boolpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         intpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+                         realpseudovec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_PSEUDO_H
+#endif // #ifndef VEC_PSEUDO_H
diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h
index 9fa6bd0..b88b0da 100644
--- a/vec_qpx_double4.h
+++ b/vec_qpx_double4.h
@@ -11,785 +11,662 @@
 
 // QPX intrinsics
 #ifdef __clang__
-#  include <qpxintrin.h>
+#include <qpxintrin.h>
 #else
-#  include <builtins.h>
+#include <builtins.h>
 #endif
 #include <mass_simd.h>
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_4
-  template<> struct boolvec<double,4>;
-  template<> struct intvec<double,4>;
-  template<> struct realvec<double,4>;
-  
-  
-  
-  template<>
-  struct boolvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef vector4double bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // canonical true is +1.0, canonical false is -1.0
-    // >=0 is true, -0 is true, nan is false
-    static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
-    static bool to_bool(real_t a) { return a>=0.0; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(vec_splats(from_bool(a))) {}
-    boolvec(const bool* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(v[n]);
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return v[n]=from_bool(a), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_not(v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    boolvec operator==(boolvec x) const
-    {
-      return vec_logical(v, x.v, 0x9);
-    }
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
-      boolvec x0123 = *this;
-      boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      boolvec y0022 = x0123 && x1032;
-      return y0022[0] && y0022[2];
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
-      boolvec x0123 = *this;
-      boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      boolvec y0022 = x0123 || x1032;
-      return y0022[0] || y0022[2];
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef vector4double ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(const intvec& x): v(x.v) {}
-    // intvec& operator=(const intvec& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats(FP::as_float(a))) {}
-    intvec(const int_t* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota()
-    {
-      const int_t iota_[] = {0, 1, 2, 3};
-      return intvec(iota_);
-    }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return FP::as_int(v[n]);
-    }
-    intvec& set_elt(int n, int_t a)
-    {
-      return v[n]=FP::as_float(a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec operator+() const { return *this; }
-    intvec operator-() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]);
-      return r;
-    }
-    
-    intvec operator+(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] + x[d]);
-      return r;
-    }
-    intvec operator-(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]);
-      return r;
-    }
-    
-    intvec& operator+=(intvec x) { return *this=*this+x; }
-    intvec& operator-=(intvec x) { return *this=*this-x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]);
-      return r;
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]);
-      return r;
-    }
-    intvec operator|(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]);
-      return r;
-    }
-    intvec operator^(intvec x) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]);
-      return r;
-    }
-    
-    intvec& operator&=(intvec x) { return *this=*this&x; }
-    intvec& operator|=(intvec x) { return *this=*this|x; }
-    intvec& operator^=(intvec x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n));
-      return r;
-    }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n);
-      return r;
-    }
-    intvec operator<<(int_t n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n);
-      return r;
-    }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d]));
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]);
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      intvec r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]);
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]);
-      return r;
-    }
-    boolvec_t operator!=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]);
-      return r;
-    }
-    boolvec_t operator<(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]);
-      return r;
-    }
-    boolvec_t operator<=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]);
-      return r;
-    }
-    boolvec_t operator>(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]);
-      return r;
-    }
-    boolvec_t operator>=(intvec x) const
-    {
-      boolvec_t r;
-      for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]);
-      return r;
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const;
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,4>: floatprops<double>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef vector4double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static const char* name() { return "<QPX:4*double>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(const realvec& x): v(x.v) {}
-    // realvec& operator=(const realvec& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(const real_t* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return v[n];
-    }
-    realvec& set_elt(int n, real_t a)
-    {
-      return v[n]=a, *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(const real_t* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_lda(0, (real_t*)p);
-    }
-    static realvec_t loadu(const real_t* p)
-    {
-      realvec_t v0 = vec_ld(0, (real_t*)p);
-      realvec_t v1 = vec_ld(31, (real_t*)p);
-      return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p));
-    }
-    static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff);
-    }
-    realvec_t loada(const real_t* p, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(const real_t* p, mask_t m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      // TODO: use load instruction with fixed offset
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_sta(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-      p[2] = (*this)[2];
-      p[3] = (*this)[3];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, mask_t m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return v; }
-    intvec_t convert_int() const { return vec_ctidz(v); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return vec_neg(v); }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const
-    {
-      // return vec_swdiv_nochk(v, x.v);
-      return div_fastd4(v, x.v);
-    }
-    
-    realvec& operator+=(realvec x) { return *this=*this+x; }
-    realvec& operator-=(realvec x) { return *this=*this-x; }
-    realvec& operator*=(realvec x) { return *this=*this*x; }
-    realvec& operator/=(realvec x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+template <> struct boolvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef vector4double bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // canonical true is +1.0, canonical false is -1.0
+  // >=0 is true, -0 is true, nan is false
+  static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
+  static bool to_bool(real_t a) { return a >= 0.0; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(vec_splats(from_bool(a))) {}
+  boolvec(const bool *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return to_bool(v[n]); }
+  boolvec &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_not(v); }
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); }
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const {
+    // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+    boolvec x0123 = *this;
+    boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    boolvec y0022 = x0123 && x1032;
+    return y0022[0] && y0022[2];
+  }
+  bool any() const {
+    // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+    boolvec x0123 = *this;
+    boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    boolvec y0022 = x0123 || x1032;
+    return y0022[0] || y0022[2];
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef vector4double ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(const intvec& x): v(x.v) {}
+  // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats(FP::as_float(a))) {}
+  intvec(const int_t *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() {
+    const int_t iota_[] = {0, 1, 2, 3};
+    return intvec(iota_);
+  }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return FP::as_int(v[n]); }
+  intvec &set_elt(int n, int_t a) { return v[n] = FP::as_float(a), *this; }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return v; }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec operator+() const { return *this; }
+  intvec operator-() const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, -(*this)[d]);
+    return r;
+  }
+
+  intvec operator+(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] + x[d]);
+    return r;
+  }
+  intvec operator-(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] - x[d]);
+    return r;
+  }
+
+  intvec &operator+=(intvec x) { return *this = *this + x; }
+  intvec &operator-=(intvec x) { return *this = *this - x; }
+
+  intvec operator~() const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, ~(*this)[d]);
+    return r;
+  }
+
+  intvec operator&(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] & x[d]);
+    return r;
+  }
+  intvec operator|(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] | x[d]);
+    return r;
+  }
+  intvec operator^(intvec x) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] ^ x[d]);
+    return r;
+  }
+
+  intvec &operator&=(intvec x) { return *this = *this & x; }
+  intvec &operator|=(intvec x) { return *this = *this | x; }
+  intvec &operator^=(intvec x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, U((*this)[d]) >> U(n));
+    return r;
+  }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >> n);
+    return r;
+  }
+  intvec operator<<(int_t n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] << n);
+    return r;
+  }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, U((*this)[d]) >> U(n[d]));
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >> n[d]);
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    intvec r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] << n[d]);
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] == x[d]);
+    return r;
+  }
+  boolvec_t operator!=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] != x[d]);
+    return r;
+  }
+  boolvec_t operator<(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] < x[d]);
+    return r;
+  }
+  boolvec_t operator<=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] <= x[d]);
+    return r;
+  }
+  boolvec_t operator>(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] > x[d]);
+    return r;
+  }
+  boolvec_t operator>=(intvec x) const {
+    boolvec_t r;
+    for (int d = 0; d < size; ++d)
+      r.set_elt(d, (*this)[d] >= x[d]);
+    return r;
+  }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const;
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+  static int const size = 4;
+  typedef real_t scalar_t;
+  typedef vector4double vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static const char *name() { return "<QPX:4*double>"; }
+  void barrier() { __asm__("" : "+v"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(const realvec& x): v(x.v) {}
+  // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vec_splats(a)) {}
+  realvec(const real_t *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v[n]; }
+  realvec &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(const real_t *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vec_lda(0, (real_t *)p);
+  }
+  static realvec_t loadu(const real_t *p) {
+    realvec_t v0 = vec_ld(0, (real_t *)p);
+    realvec_t v1 = vec_ld(31, (real_t *)p);
+    return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t *)p));
+  }
+  static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    // TODO: use load instruction with fixed offset
+    return loadu(p + ioff);
+  }
+  realvec_t loada(const real_t *p, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  realvec_t loadu(const real_t *p, mask_t m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x = vec_xmul(v, v);
-      return x[1] * x[3];
+  }
+  realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    // TODO: use load instruction with fixed offset
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vec_sta(v, 0, p);
+  }
+  void storeu(real_t *p) const {
+    // Vector stores would require vector loads, which would need to
+    // be atomic
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+    p[2] = (*this)[2];
+    p[3] = (*this)[3];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    real_t sum() const
-    {
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
-      realvec_t x = vec_xxmadd(v, c1, v);
-      return x[0] + x[2];
+  }
+  void storeu(real_t *p, mask_t m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    
-    
-    
-    boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec x) const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return v; }
+  intvec_t convert_int() const { return vec_ctidz(v); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return vec_neg(v); }
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); }
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+  realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+  realvec operator/(realvec x) const {
+    // return vec_swdiv_nochk(v, x.v);
+    return div_fastd4(v, x.v);
+  }
+
+  realvec &operator+=(realvec x) { return *this = *this + x; }
+  realvec &operator-=(realvec x) { return *this = *this - x; }
+  realvec &operator*=(realvec x) { return *this = *this * x; }
+  realvec &operator/=(realvec x) { return *this = *this / x; }
+
+  real_t maxval() const {
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    realvec_t y0022 = x0123.fmax(x1032);
+    return vml_std::fmax(y0022[0], y0022[2]);
+  }
+  real_t minval() const {
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+    realvec_t y0022 = x0123.fmin(x1032);
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const {
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x = vec_xmul(v, v);
+    return x[1] * x[3];
+  }
+  real_t sum() const {
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
+    realvec_t x = vec_xxmadd(v, c1, v);
+    return x[0] + x[2];
+  }
+
+  boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(realvec x) const { return !(*this == x); }
+  boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec x) const {
 #ifdef VML_HAVE_NAN
-      return *this < x || *this == x;
+    return *this < x || *this == x;
 #else
-      return ! (*this > x);
+    return !(*this > x);
 #endif
-    }
-    boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec x) const
-    {
+  }
+  boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec x) const {
 #ifdef VML_HAVE_NAN
-      return *this > x || *this == x;
+    return *this > x || *this == x;
 #else
-      return ! (*this < x);
+    return !(*this < x);
 #endif
-    }
-    
-    
-    
-    realvec acos() const { return acosd4(v); }
-    realvec acosh() const { return acoshd4(v); }
-    realvec asin() const { return asind4(v); }
-    realvec asinh() const { return asinhd4(v); }
-    realvec atan() const { return atand4(v); }
-    realvec atan2(realvec y) const { return atan2d4(v, y.v); }
-    realvec atanh() const { return atanhd4(v); }
-    realvec cbrt() const { return cbrtd4(v); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
-    realvec cos() const { return cosd4(v); }
-    realvec cosh() const { return coshd4(v); }
-    realvec exp() const { return expd4(v); }
-    realvec exp10() const { return exp10d4(v); }
-    realvec exp2() const { return exp2d4(v); }
-    realvec expm1() const { return expm1d4(v); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const
-    {
-      return vec_madd(v, y.v, z.v);
-    }
-    realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
-    realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return hypotd4(v, y.v); }
-    intvec_t ilogb() const
-    {
-      // int_t ilogb_[] = {
-      //   ::ilogb((*this)[0]),
-      //   ::ilogb((*this)[1]),
-      //   ::ilogb((*this)[2]),
-      //   ::ilogb((*this)[3])
-      // };
-      // return intvec_t(ilogb_);
-      return MF::vml_ilogb(v);
-    }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+
+  realvec acos() const { return acosd4(v); }
+  realvec acosh() const { return acoshd4(v); }
+  realvec asin() const { return asind4(v); }
+  realvec asinh() const { return asinhd4(v); }
+  realvec atan() const { return atand4(v); }
+  realvec atan2(realvec y) const { return atan2d4(v, y.v); }
+  realvec atanh() const { return atanhd4(v); }
+  realvec cbrt() const { return cbrtd4(v); }
+  realvec ceil() const { return vec_ceil(v); }
+  realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
+  realvec cos() const { return cosd4(v); }
+  realvec cosh() const { return coshd4(v); }
+  realvec exp() const { return expd4(v); }
+  realvec exp10() const { return exp10d4(v); }
+  realvec exp2() const { return exp2d4(v); }
+  realvec expm1() const { return expm1d4(v); }
+  realvec fabs() const { return vec_abs(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); }
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+  realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
+  realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return hypotd4(v, y.v); }
+  intvec_t ilogb() const {
+    // int_t ilogb_[] = {
+    //   ::ilogb((*this)[0]),
+    //   ::ilogb((*this)[1]),
+    //   ::ilogb((*this)[2]),
+    //   ::ilogb((*this)[3])
+    // };
+    // return intvec_t(ilogb_);
+    return MF::vml_ilogb(v);
+  }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return vec_tstnan(v, v);
+    return vec_tstnan(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
-    realvec ldexp(intvec_t n) const
-    {
-      real_t ldexp_[] = {
-        vml_std::ldexp((*this)[0], n[0]),
-        vml_std::ldexp((*this)[1], n[1]),
-        vml_std::ldexp((*this)[2], n[2]),
-        vml_std::ldexp((*this)[3], n[3])
-      };
-      return realvec_t(ldexp_);
-    }
-    realvec log() const { return logd4(v); }
-    realvec log10() const { return log10d4(v); }
-    realvec log1p() const { return log1pd4(v); }
-    realvec log2() const { return log2d4(v); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec nextafter(realvec y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec pow(realvec y) const { return powd4(v, y.v); }
-    realvec rcp() const { return recip_fastd4(v); }
-    realvec remainder(realvec y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec rint() const
-    {
-      return MF::vml_rint(*this);
-      // This is tempting, but seems too invasive
-      // #ifdef VML_HAVE_FP_CONTRACT
-      //       return MF::vml_rint(*this);
-      // #else
-      //       return vec_round(v);      // use round instead of rint
-      // #endif
-    }
-    realvec round() const { return vec_round(v); }
-    realvec rsqrt() const
-    {
-      realvec x = *this;
-      realvec r = vec_rsqrte(x.v); // this is only an approximation
-      // TODO: use fma
-      // two Newton iterations (see vml_rsqrt)
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      r += RV(0.5)*r * (RV(1.0) - x * r*r);
-      return r;
-    }
-    boolvec_t signbit() const
-    {
-      return !RV(1.0).copysign(*this).as_int().as_bool();
-    }
-    realvec sin() const { return sind4(v); }
-    realvec sinh() const { return sinhd4(v); }
-    realvec sqrt() const
-    {
-      // return vec_sqrtsw_nochk(v);
-      return *this * rsqrt();
-    }
-    realvec tan() const { return tand4(v); }
-    realvec tanh() const { return tanhd4(v); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,4> boolvec<double,4>::as_int() const
-  {
-    return v;
-  }
-  
-  inline intvec<double,4> boolvec<double,4>::convert_int() const
-  {
-    return ifthen(IV(I(1)), IV(I(0)));
-  }
-  
-  inline
-  boolvec<double,4>
-  boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,4>
-  boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,4>
-  boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,4> intvec<double,4>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::as_float() const
-  {
-    return v;
-  }
-  
-  inline intvec<double,4> intvec<double,4>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,4> intvec<double,4>::convert_float() const
-  {
-    return vec_cfid(v);
-  }
-  
-  inline boolvec<double,4> intvec<double,4>::isignbit() const
-  {
-    return MF::vml_isignbit(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
+  realvec ldexp(intvec_t n) const {
+    real_t ldexp_[] = {
+        vml_std::ldexp((*this)[0], n[0]), vml_std::ldexp((*this)[1], n[1]),
+        vml_std::ldexp((*this)[2], n[2]), vml_std::ldexp((*this)[3], n[3])};
+    return realvec_t(ldexp_);
+  }
+  realvec log() const { return logd4(v); }
+  realvec log10() const { return log10d4(v); }
+  realvec log1p() const { return log1pd4(v); }
+  realvec log2() const { return log2d4(v); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return powd4(v, y.v); }
+  realvec rcp() const { return recip_fastd4(v); }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const {
+    return MF::vml_rint(*this);
+    // This is tempting, but seems too invasive
+    // #ifdef VML_HAVE_FP_CONTRACT
+    //       return MF::vml_rint(*this);
+    // #else
+    //       return vec_round(v);      // use round instead of rint
+    // #endif
+  }
+  realvec round() const { return vec_round(v); }
+  realvec rsqrt() const {
+    realvec x = *this;
+    realvec r = vec_rsqrte(x.v); // this is only an approximation
+    // TODO: use fma
+    // two Newton iterations (see vml_rsqrt)
+    r += RV(0.5) * r * (RV(1.0) - x * r * r);
+    r += RV(0.5) * r * (RV(1.0) - x * r * r);
+    return r;
+  }
+  boolvec_t signbit() const {
+    return !RV(1.0).copysign(*this).as_int().as_bool();
+  }
+  realvec sin() const { return sind4(v); }
+  realvec sinh() const { return sinhd4(v); }
+  realvec sqrt() const {
+    // return vec_sqrtsw_nochk(v);
+    return *this * rsqrt();
+  }
+  realvec tan() const { return tand4(v); }
+  realvec tanh() const { return tanhd4(v); }
+  realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const { return v; }
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  return ifthen(IV(I(1)), IV(I(0)));
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const { return v; }
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+  return vec_cfid(v);
+}
+
+inline boolvec<double, 4> intvec<double, 4>::isignbit() const {
+  return MF::vml_isignbit(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_QPX_DOUBLE4_H
+#endif // #ifndef VEC_QPX_DOUBLE4_H
diff --git a/vec_sse_double1.h b/vec_sse_double1.h
index 5558356..d727de8 100644
--- a/vec_sse_double1.h
+++ b/vec_sse_double1.h
@@ -12,589 +12,493 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_1
-  template<> struct boolvec<double,1>;
-  template<> struct intvec<double,1>;
-  template<> struct realvec<double,1>;
-  
-  
-  
-  template<>
-  struct boolvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return !v; }
-    
-    boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
-    boolvec_t operator||(boolvec_t x) const { return v || x.v; }
-    boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return *this; }
-    bool any() const { return *this; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec_t iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec_t operator+() const { return +v; }
-    intvec_t operator-() const { return -v; }
-    
-    intvec_t operator+(intvec_t x) const { return v+x.v; }
-    intvec_t operator-(intvec_t x) const { return v-x.v; }
-    intvec_t operator*(intvec_t x) const { return v*x.v; }
-    intvec_t operator/(intvec_t x) const { return v/x.v; }
-    intvec_t operator%(intvec_t x) const { return v%x.v; }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
-    intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
-    intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return ~v; }
-    
-    intvec_t operator&(intvec_t x) const { return v&x.v; }
-    intvec_t operator|(intvec_t x) const { return v|x.v; }
-    intvec_t operator^(intvec_t x) const { return v^x.v; }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return v>>n; }
-    intvec_t operator<<(int_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const { return v>>n; }
-    intvec_t operator<<(intvec_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return __builtin_clzll(v); }
-    intvec_t popcount() const { return __builtin_popcountll(v); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-    
-    intvec_t abs() const { return std::abs(v); }
-    boolvec_t isignbit() const { return v<0; }
-    intvec_t max(intvec_t x) const { return std::max(v, x.v); }
-    intvec_t min(intvec_t x) const { return std::min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,1>: floatprops<double>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128d from_double(double a) { return _mm_set_sd(a); }
-    static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
+template <> struct boolvec<double, 1>;
+template <> struct intvec<double, 1>;
+template <> struct realvec<double, 1>;
+
+template <> struct boolvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef bool scalar_t;
+  typedef uint_t bvector_t; // the single lane is held in one uint_t
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+  // true values are non-zero, false values are zero
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {} // v is not initialized here
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(a) {}
+  boolvec(bool const *as) : v(as[0]) {} // only as[0] is read
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return v; } // n is ignored (size == 1)
+  boolvec_t &set_elt(int n, bool a) { return v = a, *this; } // n is ignored
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return !v; }
+
+  boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+  boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+  boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+  bool all() const { return *this; } // one lane: same result as any()
+  bool any() const { return *this; } // *this -> bvector_t -> bool
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const; // defined out of line
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t; // the single lane is held in one int_t
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {} // v is not initialized here
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(int_t a) : v(a) {}
+  intvec(int_t const *as) : v(as[0]) {} // only as[0] is read
+  static intvec_t iota() { return intvec(I(0)); } // the only lane gets 0
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return v; } // n is ignored (size == 1)
+  intvec_t &set_elt(int n, int_t a) { return v = a, *this; } // n is ignored
+
+  boolvec_t as_bool() const { return U(v); } // value kept as-is, cast to U
+  boolvec_t convert_bool() const { return bool(v); } // collapses to 0 or 1
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec_t operator+() const { return +v; }
+  intvec_t operator-() const { return -v; }
+
+  intvec_t operator+(intvec_t x) const { return v + x.v; }
+  intvec_t operator-(intvec_t x) const { return v - x.v; }
+  intvec_t operator*(intvec_t x) const { return v * x.v; }
+  intvec_t operator/(intvec_t x) const { return v / x.v; }
+  intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+  intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+  intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+  intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return ~v; }
+
+  intvec_t operator&(intvec_t x) const { return v & x.v; }
+  intvec_t operator|(intvec_t x) const { return v | x.v; }
+  intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const; // defined out of line
+
+  intvec_t lsr(int_t n) const { return U(v) >> U(n); } // shift done on U
+  intvec_t rotate(int_t n) const; // defined out of line
+  intvec_t operator>>(int_t n) const { return v >> n; } // shift done on int_t
+  intvec_t operator<<(int_t n) const { return v << n; }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } // shift done on U
+  intvec_t rotate(intvec_t n) const; // defined out of line
+  intvec_t operator>>(intvec_t n) const { return v >> n; }
+  intvec_t operator<<(intvec_t n) const { return v << n; }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const { return v ? __builtin_clzll(v) : 64; } // 0 -> 64: __builtin_clzll(0) is undefined
+  intvec_t popcount() const { return __builtin_popcountll(v); }
+
+  boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+  intvec_t abs() const { return std::abs(v); }
+  boolvec_t isignbit() const { return v < 0; }
+  intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+  intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<double, 1> : floatprops<double> {
+  static int const size = 1;
+  typedef real_t scalar_t;
+  typedef double vector_t; // stored as a plain double, not as __m128d
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:1*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); } // empty asm, v as in/out operand
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+private:
+  static __m128d from_double(double a) { return _mm_set_sd(a); } // a -> low lane
+  static double to_double(__m128d a) { return _mm_cvtsd_f64(a); } // low lane -> double
+
+public:
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {} // v is not initialized here
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(real_t a) : v(a) {}
+  realvec(real_t const *as) : v(as[0]) {} // only as[0] is read
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v; } // n is ignored (size == 1)
+  realvec_t &set_elt(int n, real_t a) { return v = a, *this; } // n is ignored
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return *p;
+  }
+  static realvec_t loadu(real_t const *p) { return *p; }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p is checked; ioff added after
+    return loada(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) { // NOTE(review): siblings test m.all_m -- confirm equivalent
+      return loada(p);
+    } else {
+      return *this; // nothing is loaded; the current value is returned
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return *this; // nothing is loaded; the current value is returned
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    *p = v;
+  }
+  void storeu(real_t *p) const { *p = v; }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p is checked; ioff added after
+    storea(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) { // otherwise nothing is written
+      storea(p);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) { // otherwise nothing is written
+      storeu(p);
     }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return floatprops::as_int(v); }
+  intvec_t convert_int() const {
 #ifdef __x86_64__
-      return _mm_cvttsd_si64(_mm_set_sd(v));
+    return _mm_cvttsd_si64(_mm_set_sd(v)); // cvtt: truncating conversion
 #else
-      return floatprops::convert_int(v);
+    return floatprops::convert_int(v);
 #endif
-    }
-    
-    
-    
-    realvec_t operator+() const { return +v; }
-    realvec_t operator-() const { return -v; }
-    
-    realvec_t operator+(realvec_t x) const { return v+x.v; }
-    realvec_t operator-(realvec_t x) const { return v-x.v; }
-    realvec_t operator*(realvec_t x) const { return v*x.v; }
-    realvec_t operator/(realvec_t x) const { return v/x.v; }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { return *this; }
-    real_t minval() const { return *this; }
-    real_t prod() const { return *this; }
-    real_t sum() const { return *this; }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  realvec_t operator+() const { return +v; }
+  realvec_t operator-() const { return -v; }
+
+  realvec_t operator+(realvec_t x) const { return v + x.v; }
+  realvec_t operator-(realvec_t x) const { return v - x.v; }
+  realvec_t operator*(realvec_t x) const { return v * x.v; }
+  realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return *this; } // one lane: every reduction is v
+  real_t minval() const { return *this; }
+  real_t prod() const { return *this; }
+  real_t sum() const { return *this; }
+
+  boolvec_t operator==(realvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
+    return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
 #else
-      return vml_std::ceil(v);
+    return vml_std::ceil(v);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return vml_std::fabs(v); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return vml_std::fabs(v); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_floor_sd(from_double(v), from_double(v)));
+    return to_double(_mm_floor_sd(from_double(v), from_double(v)));
 #else
-      return vml_std::floor(v);
+    return vml_std::floor(v);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const
-    {
-      return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
-    }
-    realvec_t fmin(realvec_t y) const
-    {
-      return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
-    }
-    realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
-    realvec_t frexp(intvec_t* irp) const
-    {
-      int iri;
-      realvec_t r = vml_std::frexp(v, &iri);
-      int_t ir = iri;
-      if (isinf()) ir = std::numeric_limits<int_t>::max();
-      if (isnan()) ir = std::numeric_limits<int_t>::min();
-      irp->v = ir;
-      return r;
-    }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = vml_std::ilogb(v);
-      typedef std::numeric_limits<int_t> NL;
-      if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
-        r = NL::min();
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { // NOTE(review): _mm_max_sd NaN handling may differ from std::fmax -- confirm
+    return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+  }
+  realvec_t fmin(realvec_t y) const { // NOTE(review): _mm_min_sd NaN handling may differ from std::fmin -- confirm
+    return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+  }
+  realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
+  realvec_t frexp(intvec_t *irp) const {
+    int iri; // vml_std::frexp is handed an int; widened to int_t below
+    realvec_t r = vml_std::frexp(v, &iri);
+    int_t ir = iri;
+    if (isinf())
+      ir = std::numeric_limits<int_t>::max();
+    if (isnan())
+      ir = std::numeric_limits<int_t>::min();
+    irp->v = ir;
+    return r;
+  }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const {
+    int_t r = vml_std::ilogb(v);
+    typedef std::numeric_limits<int_t> NL; // int_t limits replace the int-based special results
+    if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
+      r = NL::min();
 #if defined VML_HAVE_INF
-      } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
-        r = NL::max();
+    } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+      r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-      } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
-        r = NL::min();
+    } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
+      r = NL::min();
 #endif
-      }
-      return r;
-    }
-    boolvec_t isfinite() const { return vml_std::isfinite(v); }
-    boolvec_t isinf() const { return vml_std::isinf(v); }
-    boolvec_t isnan() const
-    {
-      // This is wrong:
-      // return _mm_ucomineq_sd(from_double(v), from_double(v));
-      // This works:
-      // char r;
-      // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
-      // return boolvec_t::scalar_t(r);
-      // This works as well:
-      return vml_std::isnan(v);
-    }
-    boolvec_t isnormal() const { return vml_std::isnormal(v); }
-    realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return R(1.0)/v; }
-    realvec_t remainder(realvec_t y) const
-    {
-      return vml_std::remainder(v, y.v);
     }
-    realvec_t rint() const
-    {
+    return r;
+  }
+  boolvec_t isfinite() const { return vml_std::isfinite(v); }
+  boolvec_t isinf() const { return vml_std::isinf(v); }
+  boolvec_t isnan() const {
+    // This is wrong:
+    // return _mm_ucomineq_sd(from_double(v), from_double(v));
+    // This works:
+    // char r;
+    // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+    // return boolvec_t::scalar_t(r);
+    // This works as well:
+    return vml_std::isnan(v);
+  }
+  boolvec_t isnormal() const { return vml_std::isnormal(v); }
+  realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return R(1.0) / v; }
+  realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_NEAREST_INT));
+    return to_double(_mm_round_sd(from_double(v), from_double(v),
+                                  _MM_FROUND_TO_NEAREST_INT));
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return vml_std::signbit(v); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const
-    {
-      return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
-    }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return vml_std::signbit(v); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const {
+    return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
+  }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return to_double(_mm_round_sd(from_double(v), from_double(v),
-                                    _MM_FROUND_TO_ZERO));
+    return to_double(
+        _mm_round_sd(from_double(v), from_double(v), _MM_FROUND_TO_ZERO));
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,1> boolvec<double,1>::as_int() const
-  {
-    return I(v);
   }
-  
-  inline intvec<double,1> boolvec<double,1>::convert_int() const
-  {
-    return v;
-  }
-  
-  inline
-  boolvec<double,1> boolvec<double,1>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  intvec<double,1> boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  realvec<double,1> boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,1> intvec<double,1>::as_float() const
-  {
-    return FP::as_float(v);
-  }
-  
-  inline realvec<double,1> intvec<double,1>::convert_float() const
-  {
+};
+
+// boolvec definitions
+
+inline intvec<double, 1> boolvec<double, 1>::as_int() const { return I(v); } // stored word cast to int_t
+
+inline intvec<double, 1> boolvec<double, 1>::convert_int() const { return v; } // v converted implicitly; no I() cast as in as_int()
+
+inline boolvec<double, 1> boolvec<double, 1>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return v ? x : y; // single lane: a plain conditional does the select
+}
+
+inline intvec<double, 1> boolvec<double, 1>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return v ? x : y; // x when v is non-zero, y otherwise
+}
+
+inline realvec<double, 1> boolvec<double, 1>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return v ? x : y; // x when v is non-zero, y otherwise
+}
+
+// intvec definitions
+
+inline realvec<double, 1> intvec<double, 1>::as_float() const {
+  return FP::as_float(v); // FP is floatprops<real_t>; its as_float does the work
+}
+
+inline realvec<double, 1> intvec<double, 1>::convert_float() const {
 #ifdef __x86_64__
-    return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
+  return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v)); // 64-bit int -> double in the low lane, then extracted
 #else
-    return FP::convert_float(v);
+  return FP::convert_float(v); // non-x86_64 builds use the floatprops helper
 #endif
-  }
-  
-  inline intvec<double,1> intvec<double,1>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,1> intvec<double,1>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,1> intvec<double,1>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<double, 1> intvec<double, 1>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y); // *this is passed as first argument
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n); // scalar count n
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n); // count supplied as an intvec_t
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_DOUBLE1_H
+#endif // #ifndef VEC_SSE_DOUBLE1_H
diff --git a/vec_sse_double2.h b/vec_sse_double2.h
index 5d64688..095f458 100644
--- a/vec_sse_double2.h
+++ b/vec_sse_double2.h
@@ -11,737 +11,600 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __m128d bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - uint_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
-    
-    bool all() const
-    {
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef bool scalar_t;
+  typedef __m128d bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm_castsi128_pd(
+            _mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
+
+  bool all() const {
 #if defined __AVX__
-      return ! (! *this).any();
+    return !(!*this).any();
 #else
-      return (*this)[0] && (*this)[1];
+    return (*this)[0] && (*this)[1];
 #endif
-    }
-    bool any() const
-    {
+  }
+  bool any() const {
 #if defined __AVX__
-      return ! bool(_mm_testz_pd(v, v));
+    return !bool(_mm_testz_pd(v, v));
 #else
-      return (*this)[0] || (*this)[1];
+    return (*this)[0] || (*this)[1];
 #endif
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef __m128i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm_set1_epi64x(a)) {}
+  intvec(int_t const *as) : v(_mm_set_epi64x(as[1], as[0])) {}
+  static intvec_t iota() { return _mm_set_epi64x(1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    // There is no intrinsic to compare to zero. Instead, we combine the
+    // sign bit of x with the sign bit of x + (FP::signbit_mask - 1).
+    intvec_t x = *this;
+    // We know that boolvec_t values depend only on the sign bit
+    // return (~(x-1) | x).as_bool();
+    // return x.as_bool() || !(x-1).as_bool();
+    return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(I(0)) - *this; }
+
+  intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
+  intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_and_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+  intvec_t operator|(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_or_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+  intvec_t operator^(intvec_t x) const {
+    return _mm_castpd_si128(
+        _mm_xor_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const {
+    // There is no _mm_srai_epi64. To emulate it, add the sign-bit bias
+    // U(1) << (bits - 1) before shifting, and subtract the shifted bias
+    // after shifting
+    intvec_t x = *this;
+    // Convert signed to unsigned
+    x += U(1) << (bits - 1);
+    // Shift
+    x = x.lsr(n);
+    // Undo conversion
+    x -= U(1) << (bits - 1 - n);
+    return x;
+  }
+  intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi64x(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {}
-    static intvec_t iota() { return _mm_set_epi64x(1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      // There is no intrinsic to compare to zero. Instead, we check
-      // whether x is positive and x-1 is negative.
-      intvec_t x = *this;
-      // We know that boolvec_t values depend only on the sign bit
-      // return (~(x-1) | x).as_bool();
-      // return x.as_bool() || !(x-1).as_bool();
-      return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(I(0)) - *this; }
-    
-    intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
-    intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v),
-                                        _mm_castsi128_pd(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v),
-                                         _mm_castsi128_pd(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const
-    {
-      // There is no _mm_srai_epi64. To emulate it, add 0x80000000
-      // before shifting, and subtract the shifted 0x80000000 after
-      // shifting
-      intvec_t x = *this;
-      // Convert signed to unsiged
-      x += U(1) << (bits-1);
-      // Shift
-      x = x.lsr(n);
-      // Undo conversion
-      x -= U(1) << (bits-1-n);
-      return x;
-    }
-    intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __m128d vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:2*double>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_pd(a)) {}
-    realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_pd(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_pd(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+    return r;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_pd(p, v);
+    return r;
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec_t const &x) const { return !(*this != x); }
+  boolvec_t operator!=(intvec_t const &x) const {
+    return (*this ^ x).convert_bool();
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_pd(p, v);
+    return r;
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef __m128d vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:2*double>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(_mm_set1_pd(a)) {}
+  realvec(real_t const *as) : v(_mm_set_pd(as[1], as[0])) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return _mm_load_pd(p);
+  }
+  static realvec_t loadu(real_t const *p) { return _mm_loadu_pd(p); }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    _mm_store_pd(p, v);
+  }
+  void storeu(real_t *p) const { return _mm_storeu_pd(p, v); }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
 #if defined __AVX__
-        _mm_maskstore_pd(p, m.m.as_int(), v);
+      _mm_maskstore_pd(p, m.m.as_int(), v);
 #else
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
+      if (m.m[0])
+        _mm_storel_pd(p, v);
+      else if (m.m[1])
+        _mm_storeh_pd(p + 1, v);
 #endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if      (m.m[0]) _mm_storel_pd(p  , v);
-        else if (m.m[1]) _mm_storeh_pd(p+1, v);
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
     }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castpd_si128(v); }
-    intvec_t convert_int() const
-    {
-      intvec_t r;
-      r.set_elt(0, floatprops::convert_int((*this)[0]));
-      r.set_elt(1, floatprops::convert_int((*this)[1]));
-      return r;
-    }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax((*this)[0], (*this)[1]);
-    }
-    real_t minval() const
-    {
-      return vml_std::fmin((*this)[0], (*this)[1]);
-    }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        _mm_storel_pd(p, v);
+      else if (m.m[1])
+        _mm_storeh_pd(p + 1, v);
     }
-    real_t sum() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return _mm_castpd_si128(v); }
+  intvec_t convert_int() const {
+    intvec_t r;
+    r.set_elt(0, floatprops::convert_int((*this)[0]));
+    r.set_elt(1, floatprops::convert_int((*this)[1]));
+    return r;
+  }
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; }
+
+  realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
+  realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
+  realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
+  realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); }
+  real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const {
 #ifdef __SSE3__
-      return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
+    return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
 #else
-      return (*this)[0] + (*this)[1];
+    return (*this)[0] + (*this)[1];
 #endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm_cmpeq_pd(v, x.v);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm_cmpneq_pd(v, x.v);
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm_cmplt_pd(v, x.v);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm_cmple_pd(v, x.v);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm_cmpgt_pd(v, x.v);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm_cmpge_pd(v, x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    return _mm_cmpeq_pd(v, x.v);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm_cmpneq_pd(v, x.v);
+  }
+  boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_pd(v, x.v); }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm_cmple_pd(v, x.v);
+  }
+  boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_pd(v, x.v); }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm_cmpge_pd(v, x.v);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return _mm_ceil_pd(v);
+    return _mm_ceil_pd(v);
 #else
-      return MF::vml_ceil(*this);
+    return MF::vml_ceil(*this);
 #endif
- }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return _mm_floor_pd(v);
+    return _mm_floor_pd(v);
 #else
-      return MF::vml_floor(*this);
+    return MF::vml_floor(*this);
 #endif
- }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
+  realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #ifdef VML_HAVE_NAN
-      return _mm_cmpunord_pd(v, v);
+    return _mm_cmpunord_pd(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+    return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm_sqrt_pd(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return v; }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm_sqrt_pd(v); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
+    return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
- }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,2> boolvec<double,2>::as_int() const
-  {
-    return _mm_castpd_si128(v);
-  }
-  
-  inline intvec<double,2> boolvec<double,2>::convert_int() const
-  {
-    //return ifthen(v, U(1), U(0));
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline
-  intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
-  {
+  }
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const {
+  return _mm_castpd_si128(v);
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const {
+  // return ifthen(v, U(1), U(0));
+  return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
 #ifdef __SSE4_1__
-    return _mm_blendv_pd(y.v, x.v, v);
+  return _mm_blendv_pd(y.v, x.v, v);
 #else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
+  return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+      .as_float();
 #endif
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<double,2> intvec<double,2>::as_float() const
-  {
-    return _mm_castsi128_pd(v);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::convert_float() const
-  {
-    realvec_t r;
-    r.set_elt(0, floatprops::convert_float((*this)[0]));
-    r.set_elt(1, floatprops::convert_float((*this)[1]));
-    return r;
-  }
-  
-  inline intvec<double,2> intvec<double,2>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
-                                                      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+// intvec definitions
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const {
+  return _mm_castsi128_pd(v);
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const {
+  realvec_t r;
+  r.set_elt(0, floatprops::convert_float((*this)[0]));
+  r.set_elt(1, floatprops::convert_float((*this)[1]));
+  return r;
+}
+
+inline intvec<double, 2> intvec<double, 2>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_DOUBLE2_H
+#endif // #ifndef VEC_SSE_DOUBLE2_H
diff --git a/vec_sse_float1.h b/vec_sse_float1.h
index 9cee891..a84a046 100644
--- a/vec_sse_float1.h
+++ b/vec_sse_float1.h
@@ -12,583 +12,489 @@
 
 // SSE2 intrinsics
 #include <emmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSE4_1__               // Intel's SSE 4.1
-#  include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#ifdef __SSE4A__                // AMD's SSE 4a
-#  include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_1
-  template<> struct boolvec<float,1>;
-  template<> struct intvec<float,1>;
-  template<> struct realvec<float,1>;
-  
-  
-  
-  template<>
-  struct boolvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef bool scalar_t;
-    typedef uint_t bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-    // true values are non-zero, false values are zero
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v(a) {}
-    boolvec(bool const* as): v(as[0]) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const { return v; }
-    boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return !v; }
-    
-    boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
-    boolvec_t operator||(boolvec_t x) const { return v || x.v; }
-    boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
-    boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-    
-    bool all() const { return *this; }
-    bool any() const { return *this; }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(int_t a): v(a) {}
-    intvec(int_t const* as): v(as[0]) {}
-    static intvec_t iota() { return intvec(I(0)); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const { return v; }
-    intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return U(v); }
-    boolvec_t convert_bool() const { return bool(v); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    intvec_t operator+() const { return +v; }
-    intvec_t operator-() const { return -v; }
-    
-    intvec_t operator+(intvec_t x) const { return v+x.v; }
-    intvec_t operator-(intvec_t x) const { return v-x.v; }
-    intvec_t operator*(intvec_t x) const { return v*x.v; }
-    intvec_t operator/(intvec_t x) const { return v/x.v; }
-    intvec_t operator%(intvec_t x) const { return v%x.v; }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
-    intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
-    intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec_t operator~() const { return ~v; }
-    
-    intvec_t operator&(intvec_t x) const { return v&x.v; }
-    intvec_t operator|(intvec_t x) const { return v|x.v; }
-    intvec_t operator^(intvec_t x) const { return v^x.v; }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return v>>n; }
-    intvec_t operator<<(int_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const { return v>>n; }
-    intvec_t operator<<(intvec_t n) const { return v<<n; }
-    
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const { return __builtin_clz(v); }
-    intvec_t popcount() const { return __builtin_popcount(v); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-    
-    intvec_t abs() const { return std::abs(v); }
-    boolvec_t isignbit() const { return v<0; }
-    intvec_t max(intvec_t x) const { return std::max(v, x.v); }
-    intvec_t min(intvec_t x) const { return std::min(v, x.v); }
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,1>: floatprops<float>
-  {
-    static int const size = 1;
-    typedef real_t scalar_t;
-    typedef float vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:1*float>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-  private:
-    static __m128 from_float(float a) { return _mm_set_ss(a); }
-    static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
-  public:
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(real_t a): v(a) {}
-    realvec(real_t const* as): v(as[0]) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const { return v; }
-    realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return *p;
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return *this;
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loada(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      *p = v;
-    }
-    void storeu(real_t* p) const
-    {
-      *p = v;
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      }
+template <> struct boolvec<float, 1>;
+template <> struct intvec<float, 1>;
+template <> struct realvec<float, 1>;
+
+template <> struct boolvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef bool scalar_t;
+  typedef uint_t bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+  // true values are non-zero, false values are zero
+
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(a) {}
+  boolvec(bool const *as) : v(as[0]) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { return v; }
+  boolvec_t &set_elt(int n, bool a) { return v = a, *this; }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return !v; }
+
+  boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+  boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+  boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
+  boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+  bool all() const { return *this; }
+  bool any() const { return *this; }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(int_t a) : v(a) {}
+  intvec(int_t const *as) : v(as[0]) {}
+  static intvec_t iota() { return intvec(I(0)); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const { return v; }
+  intvec_t &set_elt(int n, int_t a) { return v = a, *this; }
+
+  boolvec_t as_bool() const { return U(v); }
+  boolvec_t convert_bool() const { return bool(v); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  intvec_t operator+() const { return +v; }
+  intvec_t operator-() const { return -v; }
+
+  intvec_t operator+(intvec_t x) const { return v + x.v; }
+  intvec_t operator-(intvec_t x) const { return v - x.v; }
+  intvec_t operator*(intvec_t x) const { return v * x.v; }
+  intvec_t operator/(intvec_t x) const { return v / x.v; }
+  intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+  intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+  intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+  intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+  intvec_t operator~() const { return ~v; }
+
+  intvec_t operator&(intvec_t x) const { return v & x.v; }
+  intvec_t operator|(intvec_t x) const { return v | x.v; }
+  intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const { return v >> n; }
+  intvec_t operator<<(int_t n) const { return v << n; }
+
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const { return v >> n; }
+  intvec_t operator<<(intvec_t n) const { return v << n; }
+
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+  intvec_t clz() const { return __builtin_clz(v); }
+  intvec_t popcount() const { return __builtin_popcount(v); }
+
+  boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+  intvec_t abs() const { return std::abs(v); }
+  boolvec_t isignbit() const { return v < 0; }
+  intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+  intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<float, 1> : floatprops<float> {
+  static int const size = 1;
+  typedef real_t scalar_t;
+  typedef float vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<SSE2:1*float>"; }
+  void barrier() { __asm__("" : "+x"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+private:
+  static __m128 from_float(float a) { return _mm_set_ss(a); }
+  static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
+
+public:
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(real_t a) : v(a) {}
+  realvec(real_t const *as) : v(as[0]) {}
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const { return v; }
+  realvec_t &set_elt(int n, real_t a) { return v = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return *p;
+  }
+  static realvec_t loadu(real_t const *p) { return *p; }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return *this;
     }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      }
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return *this;
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storea(p+ioff, m);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loada(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    *p = v;
+  }
+  void storeu(real_t *p) const { *p = v; }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
     }
-    
-    
-    
-    intvec_t as_int() const { return floatprops::as_int(v); }
-    intvec_t convert_int() const {
-      // return floatprops::convert_int(v);
-      return _mm_cvttss_si32(_mm_set_ss(v));
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
     }
-    
-    
-    
-    realvec_t operator+() const { return +v; }
-    realvec_t operator-() const { return -v; }
-    
-    realvec_t operator+(realvec_t x) const { return v+x.v; }
-    realvec_t operator-(realvec_t x) const { return v-x.v; }
-    realvec_t operator*(realvec_t x) const { return v*x.v; }
-    realvec_t operator/(realvec_t x) const { return v/x.v; }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const { return *this; }
-    real_t minval() const { return *this; }
-    real_t prod() const { return *this; }
-    real_t sum() const { return *this; }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
-    boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
-    boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
-    boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
-    boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
-    boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storea(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return floatprops::as_int(v); }
+  intvec_t convert_int() const {
+    // return floatprops::convert_int(v);
+    return _mm_cvttss_si32(_mm_set_ss(v));
+  }
+
+  realvec_t operator+() const { return +v; }
+  realvec_t operator-() const { return -v; }
+
+  realvec_t operator+(realvec_t x) const { return v + x.v; }
+  realvec_t operator-(realvec_t x) const { return v - x.v; }
+  realvec_t operator*(realvec_t x) const { return v * x.v; }
+  realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return *this; }
+  real_t minval() const { return *this; }
+  real_t prod() const { return *this; }
+  real_t sum() const { return *this; }
+
+  boolvec_t operator==(realvec_t const &x) const { return v == x.v; }
+  boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+  boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+  boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+  boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+  boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
+    return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
 #else
-      return vml_std::ceil(v);
+    return vml_std::ceil(v);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return vml_std::fabs(v); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return vml_std::fabs(v); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_floor_ss(from_float(v), from_float(v)));
+    return to_float(_mm_floor_ss(from_float(v), from_float(v)));
 #else
-      return vml_std::floor(v);
+    return vml_std::floor(v);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const
-    {
-      return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
-    }
-    realvec_t fmin(realvec_t y) const
-    {
-      return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
-    }
-    realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
-    realvec_t frexp(intvec_t* irp) const
-    {
-      int iri;
-      realvec_t r = vml_std::frexp(v, &iri);
-      int_t ir = iri;
-      if (isinf()) ir = std::numeric_limits<int_t>::max();
-      if (isnan()) ir = std::numeric_limits<int_t>::min();
-      irp->v = ir;
-      return r;
-    }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const
-    {
-      int_t r = vml_std::ilogb(v);
-      typedef std::numeric_limits<int_t> NL;
-      if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
-        r = NL::min();
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const {
+    return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+  }
+  realvec_t fmin(realvec_t y) const {
+    return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+  }
+  realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
+  realvec_t frexp(intvec_t *irp) const {
+    int iri;
+    realvec_t r = vml_std::frexp(v, &iri);
+    int_t ir = iri;
+    if (isinf())
+      ir = std::numeric_limits<int_t>::max();
+    if (isnan())
+      ir = std::numeric_limits<int_t>::min();
+    irp->v = ir;
+    return r;
+  }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const {
+    int_t r = vml_std::ilogb(v);
+    typedef std::numeric_limits<int_t> NL;
+    if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
+      r = NL::min();
 #if defined VML_HAVE_INF
-      } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
-        r = NL::max();
+    } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+      r = NL::max();
 #endif
 #if defined VML_HAVE_NAN
-      } else if (FP_ILOGBNAN != NL::min() and isnan()) {
-        r = NL::min();
+    } else if (FP_ILOGBNAN != NL::min() and isnan()) {
+      r = NL::min();
 #endif
-      }
-      return r;
     }
-    boolvec_t isfinite() const { return vml_std::isfinite(v); }
-    boolvec_t isinf() const { return vml_std::isinf(v); }
-    boolvec_t isnan() const
-    {
+    return r;
+  }
+  boolvec_t isfinite() const { return vml_std::isfinite(v); }
+  boolvec_t isinf() const { return vml_std::isinf(v); }
+  boolvec_t isnan() const {
 #if defined VML_HAVE_NAN
-      // This is wrong:
-      // return _mm_ucomineq_ss(from_float(v), from_float(v));
-      // This works:
-      // char r;
-      // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
-      // return boolvec_t::scalar_t(r);
-      // This works as well:
-      return vml_std::isnan(v);
+    // This is wrong:
+    // return _mm_ucomineq_ss(from_float(v), from_float(v));
+    // This works:
+    // char r;
+    // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+    // return boolvec_t::scalar_t(r);
+    // This works as well:
+    return vml_std::isnan(v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return vml_std::isnormal(v); }
-    realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return R(1.0)/v; }
-    realvec_t remainder(realvec_t y) const
-    {
-      return vml_std::remainder(v, y.v);
-    }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return vml_std::isnormal(v); }
+  realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return R(1.0) / v; }
+  realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_NEAREST_INT));
+    return to_float(
+        _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_NEAREST_INT));
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return vml_std::signbit(v); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return vml_std::signbit(v); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return to_float(_mm_round_ss(from_float(v), from_float(v),
-                                   _MM_FROUND_TO_ZERO));
+    return to_float(
+        _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_ZERO));
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,1> boolvec<float,1>::as_int() const
-  {
-    return I(v);
-  }
-  
-  inline intvec<float,1> boolvec<float,1>::convert_int() const
-  {
-    return v;
-  }
-  
-  inline
-  boolvec<float,1> boolvec<float,1>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline intvec<float,1> boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  inline
-  realvec<float,1> boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return v ? x : y;
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline realvec<float,1> intvec<float,1>::as_float() const
-  {
-    return FP::as_float(v);
-  }
-  
-  inline intvec<float,1> intvec<float,1>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline realvec<float,1> intvec<float,1>::convert_float() const
-  {
-    // return FP::convert_float(v);
-    return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
   }
-  
-  inline intvec<float,1> intvec<float,1>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,1> intvec<float,1>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+};
+
+// boolvec definitions
+
+inline intvec<float, 1> boolvec<float, 1>::as_int() const { return I(v); }
+
+inline intvec<float, 1> boolvec<float, 1>::convert_int() const { return v; }
+
+inline boolvec<float, 1> boolvec<float, 1>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return v ? x : y;
+}
+
+inline intvec<float, 1> boolvec<float, 1>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return v ? x : y;
+}
+
+inline realvec<float, 1> boolvec<float, 1>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
+  return v ? x : y;
+}
+
+// intvec definitions
+
+inline realvec<float, 1> intvec<float, 1>::as_float() const {
+  return FP::as_float(v);
+}
+
+inline intvec<float, 1> intvec<float, 1>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline realvec<float, 1> intvec<float, 1>::convert_float() const {
+  // return FP::convert_float(v);
+  return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_FLOAT1_H
+#endif // #ifndef VEC_SSE_FLOAT1_H
diff --git a/vec_sse_float4.h b/vec_sse_float4.h
index 940de67..f8e8e80 100644
--- a/vec_sse_float4.h
+++ b/vec_sse_float4.h
@@ -11,766 +11,642 @@
 
 // SSE2 intrinsics
 #include <xmmintrin.h>
-#ifdef __SSE3__                 // Intel's SSE 3
-#  include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
 #endif
-#ifdef __SSSE3__                // Intel's SSSE 3
-#  include <tmmintrin.h>
+#ifdef __SSSE3__ // Intel's SSSE 3
+#include <tmmintrin.h>
 #endif
-#if defined __SSE4_1__          // Intel's SSE 4.1
-#  include <smmintrin.h>
+#if defined __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
 #endif
-#if defined __SSE4A__           // AMD's SSE 4a
-#  include <ammintrin.h>
+#if defined __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
 #endif
-#if defined __AVX__             // Intel's AVX
-#  include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_FLOAT_4
-  template<> struct boolvec<float,4>;
-  template<> struct intvec<float,4>;
-  template<> struct realvec<float,4>;
-  
-  
-  
-  template<>
-  struct boolvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef bool scalar_t;
-    typedef __m128 bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values have the sign bit set, false values have it unset
-    static uint_t from_bool(bool a) { return - int_t(a); }
-    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a):
-    v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
-    boolvec(bool const* as):
-    v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]),
-                                     from_bool(as[2]),
-                                     from_bool(as[1]),
-                                     from_bool(as[0])))) {}
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec_t& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); }
-    
-    boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); }
-    boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); }
-    boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
-    boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); }
-    
-    bool all() const
-    {
-      // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef bool scalar_t;
+  typedef __m128 bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
+  boolvec(bool const *as)
+      : v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), from_bool(as[2]),
+                                         from_bool(as[1]), from_bool(as[0])))) {
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); }
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); }
+  boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); }
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+  boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); }
+
+  bool all() const {
+// return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
 #if defined __AVX__
-      return ! (! *this).any();
+    return !(!*this).any();
 #else
-      boolvec_t x = *this;
-      x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0] && x[2];
+    boolvec_t x = *this;
+    x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+    return x[0] && x[2];
 #endif
-    }
-    bool any() const
-    {
-      // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+  }
+  bool any() const {
+// return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
 #if defined __AVX__
-      return ! bool(_mm_testz_ps(v, v));
+    return !bool(_mm_testz_ps(v, v));
 #else
-      boolvec_t x = *this;
-      x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
-      return x[0] || x[2];
+    boolvec_t x = *this;
+    x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+    return x[0] || x[2];
 #endif
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+  static int const size = 4;
+  typedef int_t scalar_t;
+  typedef __m128i ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(_mm_set1_epi32(a)) {}
+  intvec(int_t const *as) : v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
+  static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
+  boolvec_t convert_bool() const {
+    // Result: convert_bool(0)=false, convert_bool(else)=true
+    return !IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
+  }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Note: not all arithmetic operations are supported!
+
+  intvec_t operator+() const { return *this; }
+  intvec_t operator-() const { return IV(0) - *this; }
+
+  intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); }
+  intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); }
+
+  intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+  intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+  intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+  intvec_t operator&(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+  intvec_t operator|(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_or_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+  intvec_t operator^(intvec_t x) const {
+    return _mm_castps_si128(
+        _mm_xor_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+  }
+
+  intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+  intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+  intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); }
+  intvec_t rotate(int_t n) const;
+  intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
+  intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
+  intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef int_t scalar_t;
-    typedef __m128i ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(_mm_set1_epi32(a)) {}
-    intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
-    static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
-    boolvec_t convert_bool() const
-    {
-      // Result: convert_bool(0)=false, convert_bool(else)=true
-      return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
-    }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Note: not all arithmetic operations are supported!
-    
-    intvec_t operator+() const { return *this; }
-    intvec_t operator-() const { return IV(0) - *this; }
-    
-    intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); }
-    intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); }
-    
-    intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
-    intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-    
-    
-    
-    intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v),
-                                        _mm_castsi128_ps(x.v)));
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v),
-                                         _mm_castsi128_ps(x.v)));
-    }
-    
-    intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
-    intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
-    intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); }
-    intvec_t rotate(int_t n) const;
-    intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
-    intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
-    intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
-    intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      return ! (*this != x);
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      return (*this ^ x).convert_bool();
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      // return (*this - x).as_bool();
-      boolvec_t r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] < x[i]);
-      }
-      return r;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<float,4>: floatprops<float>
-  {
-    static int const size = 4;
-    typedef real_t scalar_t;
-    typedef __m128 vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<SSE2:4*float>"; }
-    void barrier() { __asm__("": "+x"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(_mm_set1_ps(a)) {}
-    realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return _mm_load_ps(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      return _mm_loadu_ps(p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      if (ioff==0) return loada(p);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
     }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
+    return r;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      _mm_store_ps(p, v);
+    return r;
+  }
+  intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } // in-place form of operator>>(intvec_t): lane i is shifted by n[i]
+  intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } // in-place form of operator<<(intvec_t): lane i is shifted by n[i]
+
+  intvec_t clz() const;      // declaration only; the body is defined out of line
+  intvec_t popcount() const; // declaration only; the body is defined out of line
+
+  boolvec_t operator==(intvec_t const &x) const { return !(*this != x); } // equality is the negation of operator!=
+  boolvec_t operator!=(intvec_t const &x) const { // inequality, derived from a bitwise XOR of the operands
+    return (*this ^ x).convert_bool(); // XOR is non-zero exactly where lanes differ; presumably convert_bool() maps non-zero to true -- confirm
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    // return (*this - x).as_bool();
+    boolvec_t r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] < x[i]);
     }
-    void storeu(real_t* p) const
-    {
-      return _mm_storeu_ps(p, v);
+    return r;
+  }
+  boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } // derived: a <= b is !(a > b)
+  boolvec_t operator>(intvec_t const &x) const { return x < *this; }     // derived: a > b is b < a, reusing the element-wise operator<
+  boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } // derived: a >= b is !(a < b)
+
+  intvec_t abs() const; // declaration only; the body is defined out of line
+  boolvec_t isignbit() const { return as_bool(); } // returns as_bool() of this vector; presumably true where a lane's sign bit is set -- confirm against as_bool()
+  intvec_t max(intvec_t x) const; // declaration only; the body is defined out of line
+  intvec_t min(intvec_t x) const; // declaration only; the body is defined out of line
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+  static int const size = 4; // number of float lanes in one vector
+  typedef real_t scalar_t;
+  typedef __m128 vector_t; // underlying SSE register type
+  static int const alignment = sizeof(vector_t); // in bytes; checked by the VML_ASSERT calls in the aligned load/store members
+
+  static char const *name() { return "<SSE2:4*float>"; } // fixed label string for this specialization
+  void barrier() { __asm__("" : "+x"(v)); } // empty asm naming v as a read-write SSE-register operand, so the compiler must assume v may change here
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF; // the MF::vml_* calls in the member functions resolve through this typedef
+
+  vector_t v; // the only data member: the four packed floats
+
+  realvec() {} // empty body and no member initializer: v is left uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}            // implicit wrap of a raw __m128
+  realvec(real_t a) : v(_mm_set1_ps(a)) {} // implicit broadcast of one scalar to all four lanes
+  realvec(real_t const *as) : v(_mm_set_ps(as[3], as[2], as[1], as[0])) {} // _mm_set_ps takes the highest lane first, so lane i receives as[i]; reads as[0..3]
+
+  operator vector_t() const { return v; } // implicit conversion, so a realvec can be passed straight to an intrinsic
+  real_t operator[](int n) const { // read-only access to lane n (returns by value)
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); // element extraction is delegated to the shared get_elt helper
+  }
+  realvec_t &set_elt(int n, real_t a) { // writes a into lane n through the shared set_elt helper
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; // comma operator: do the write, then return *this for chaining
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) { // aligned load of four floats starting at p
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of `alignment` (sizeof(__m128)) bytes
+    return _mm_load_ps(p); // aligned-load intrinsic; the __m128 result converts implicitly to realvec_t
+  }
+  static realvec_t loadu(real_t const *p) { return _mm_loadu_ps(p); } // unaligned load of four floats; no alignment check
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { // load from p + ioff, where ioff counts elements and p itself is aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer is checked, not p + ioff
+    if (ioff % realvec::size == 0) // an offset of whole vectors keeps p + ioff aligned
+      return loada(p + ioff);
+    if (ioff == 0) // NOTE(review): never taken -- ioff == 0 already satisfies the test above
+      return loada(p);
+    return loadu(p + ioff); // any other offset falls back to the unaligned load
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked load from p + ioff (ioff in elements); non-static because the masked loads fall back to *this
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer is checked, not p + ioff
+    if (ioff % realvec::size == 0) // an offset of whole vectors keeps p + ioff aligned
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m); // otherwise use the masked unaligned load
+  }
+
+  void storea(real_t *p) const { // aligned store of all four lanes to p
+    VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of `alignment` bytes
+    _mm_store_ps(p, v);
+  }
+  void storeu(real_t *p) const { return _mm_storeu_ps(p, v); } // unaligned store of all four lanes; no alignment check
+  void storeu(real_t *p, std::ptrdiff_t ioff) const { // store to p + ioff, where ioff counts elements and p itself is aligned
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer is checked, not p + ioff
+    if (ioff % realvec::size == 0) // an offset of whole vectors keeps p + ioff aligned
+      return storea(p + ioff);
+    storeu(p + ioff); // any other offset falls back to the unaligned store
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
 #if defined __AVX__
-        _mm_maskstore_ps(p, m.m.as_int(), v);
+      _mm_maskstore_ps(p, m.m.as_int(), v);
 #else
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
 #endif
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-        if (m.m[2]) p[2] = (*this)[2];
-        if (m.m[3]) p[3] = (*this)[3];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return _mm_castps_si128(v); }
-    intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
-    
-    
-    
-    realvec_t operator+() const { return *this; }
-    realvec_t operator-() const { return RV(0.0) - *this; }
-    
-    realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); }
-    realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); }
-    realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); }
-    realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); }
-    
-    realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
-    realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
-    realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
-    realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
-      //                      vml_std::fmax((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123.fmax(x1032);
-      return vml_std::fmax(y0022[0], y0022[2]);
-    }
-    real_t minval() const
-    {
-      // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
-      //                      vml_std::fmin((*this)[2], (*this)[3]));
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123.fmin(x1032);
-      return vml_std::fmin(y0022[0], y0022[2]);
     }
-    real_t prod() const
-    {
-      // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123 * x1032;
-      return y0022[0] * y0022[2];
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
+      if (m.m[2])
+        p[2] = (*this)[2];
+      if (m.m[3])
+        p[3] = (*this)[3];
     }
-    real_t sum() const
-    {
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked store to p + ioff (ioff in elements)
+    VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer is checked, not p + ioff
+    if (ioff % realvec::size == 0) // an offset of whole vectors keeps p + ioff aligned
+      return storea(p + ioff, m);
+    storeu(p + ioff, m); // otherwise use the masked unaligned store
+  }
+
+  intvec_t as_int() const { return _mm_castps_si128(v); }      // bit-for-bit reinterpretation as integers; no numeric conversion
+  intvec_t convert_int() const { return _mm_cvttps_epi32(v); } // numeric float -> int32 conversion with truncation
+
+  realvec_t operator+() const { return *this; }
+  realvec_t operator-() const { return RV(0.0) - *this; } // NOTE(review): computed as 0 - x, so +0.0 stays +0.0 instead of becoming -0.0 -- confirm this is intended
+
+  realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); } // lane-wise add
+  realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); } // lane-wise subtract
+  realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); } // lane-wise multiply
+  realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); } // lane-wise divide
+
+  realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } // compound forms reuse the binary operators above
+  realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+  realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+  realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+  real_t maxval() const { // horizontal maximum of the four lanes
+    // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+    //                      vml_std::fmax((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); // immediate selects lanes (1,0,3,2): swaps the two lanes of each pair
+    realvec_t y0022 = x0123.fmax(x1032); // lane 0 = max(x0,x1), lane 2 = max(x2,x3); lanes 1 and 3 repeat them
+    return vml_std::fmax(y0022[0], y0022[2]); // final step combines the two pair results as scalars
+  }
+  real_t minval() const { // horizontal minimum of the four lanes; same scheme as maxval()
+    // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+    //                      vml_std::fmin((*this)[2], (*this)[3]));
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); // immediate selects lanes (1,0,3,2)
+    realvec_t y0022 = x0123.fmin(x1032); // lane 0 = min(x0,x1), lane 2 = min(x2,x3)
+    return vml_std::fmin(y0022[0], y0022[2]);
+  }
+  real_t prod() const { // product of the four lanes, evaluated as (x0*x1) * (x2*x3)
+    // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); // immediate selects lanes (1,0,3,2)
+    realvec_t y0022 = x0123 * x1032; // lane 0 = x0*x1, lane 2 = x2*x3
+    return y0022[0] * y0022[2]; // association differs from the left-to-right scalar expression in the comment above
+  }
+  real_t sum() const {
 #ifdef __SSE3__
-      realvec_t x = *this;
-      x = _mm_hadd_ps(x.v, x.v);
-      x = _mm_hadd_ps(x.v, x.v);
-      return x[0];
+    realvec_t x = *this;
+    x = _mm_hadd_ps(x.v, x.v);
+    x = _mm_hadd_ps(x.v, x.v);
+    return x[0];
 #else
-      // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
-      realvec_t x0123 = *this;
-      realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
-      realvec_t y0022 = x0123 + x1032;
-      return y0022[0] + y0022[2];
+    // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    realvec_t x0123 = *this;
+    realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+    realvec_t y0022 = x0123 + x1032;
+    return y0022[0] + y0022[2];
 #endif
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      return _mm_cmpeq_ps(v, x.v);
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      return _mm_cmpneq_ps(v, x.v);
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      return _mm_cmplt_ps(v, x.v);
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      return _mm_cmple_ps(v, x.v);
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      return _mm_cmpgt_ps(v, x.v);
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      return _mm_cmpge_ps(v, x.v);
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const
-    {
+  }
+
+  boolvec_t operator==(realvec_t const &x) const { // lane-wise comparisons: each returns the SSE compare result as a boolvec_t
+    return _mm_cmpeq_ps(v, x.v);
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    return _mm_cmpneq_ps(v, x.v);
+  }
+  boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_ps(v, x.v); }
+  boolvec_t operator<=(realvec_t const &x) const {
+    return _mm_cmple_ps(v, x.v);
+  }
+  boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_ps(v, x.v); }
+  boolvec_t operator>=(realvec_t const &x) const {
+    return _mm_cmpge_ps(v, x.v);
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); } // this run of members forwards to the matching MF::vml_* routine (MF = mathfuncs<realvec_t>)
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // *this is passed as the first argument, y as the second
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const {
 #ifdef __SSE4_1__
-      return _mm_ceil_ps(v);
+    return _mm_ceil_ps(v);
 #else
-      return MF::vml_ceil(*this);
+    return MF::vml_ceil(*this);
 #endif
-    }
-    realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const
-    {
+  }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } // this run of members forwards to the matching MF::vml_* routine; *this is always the first argument
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const {
 #ifdef __SSE4_1__
-      return _mm_floor_ps(v);
+    return _mm_floor_ps(v);
 #else
-      return MF::vml_floor(*this);
+    return MF::vml_floor(*this);
 #endif
-    }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
-    realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const
-    {
+  }
+  realvec_t fma(realvec_t y, realvec_t z) const { // forwards to MF::vml_fma with *this as the first operand
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); } // direct SSE max; NOTE(review): NaN handling of _mm_max_ps may differ from std::fmax -- confirm
+  realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); } // direct SSE min; NOTE(review): NaN handling of _mm_min_ps may differ from std::fmin -- confirm
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } // r is handed to vml_frexp unchanged; presumably it receives the exponents -- confirm
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const {
 #if defined VML_HAVE_NAN
-      return _mm_cmpunord_ps(v, v);
+    return _mm_cmpunord_ps(v, v);
 #else
-      return BV(false);
+    return BV(false);
 #endif
-    }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm_rcp_ps(x); // this is only an approximation
-      r *= RV(2.0) - r*x;        // one Newton iteration (see vml_rcp)
-      return r;
-    }
-    realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
-    realvec_t rint() const
-    {
+  }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); } // the one-line members in this run forward to the matching MF::vml_* routine
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }    // overload taking a single scalar n
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } // overload taking a vector n
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { // reciprocal: hardware estimate refined once
+    realvec_t x = *this;
+    realvec_t r = _mm_rcp_ps(x); // this is only an approximation
+    r *= RV(2.0) - r * x;        // one Newton iteration (see vml_rcp): r <- r * (2 - r * x)
+    return r;
+  }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const {
 #ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
 #else
-      return MF::vml_rint(*this);
+    return MF::vml_rint(*this);
 #endif
-    }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const
-    {
-      realvec_t x = *this;
-      realvec_t r = _mm_rsqrt_ps(x);    // this is only an approximation
-      r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
-      return r;
-    }
-    boolvec_t signbit() const { return v; }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return _mm_sqrt_ps(v); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const
-    {
+  }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { // reciprocal square root: hardware estimate refined once
+    realvec_t x = *this;
+    realvec_t r = _mm_rsqrt_ps(x);      // this is only an approximation
+    r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt): r <- r * (1.5 - 0.5 * x * r * r)
+    return r;
+  }
+  boolvec_t signbit() const { return v; } // hands the raw float bits to boolvec_t unchanged; boolvec's convert_int() reads only each lane's top bit
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return _mm_sqrt_ps(v); } // direct SSE square root, not a vml_* routine
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const {
 #ifdef __SSE4_1__
-      return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
+    return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
 #else
-      return MF::vml_trunc(*this);
+    return MF::vml_trunc(*this);
 #endif
-    }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<float,4> boolvec<float,4>::as_int() const
-  {
-    return _mm_castps_si128(v);
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::convert_int() const
-  {
-    return lsr(as_int(), bits-1);
-  }
-  
-  inline
-  boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return ifthen(x.as_int(), y.as_int()).as_bool();
-  }
-  
-  inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return ifthen(x.as_float(), y.as_float()).as_int();
-  }
-  
-  inline
-  realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
-  {
+  }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const { // bit-for-bit view of the mask as integers
+  return _mm_castps_si128(v); // cast intrinsic: reinterprets the register, no value conversion
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const { // numeric conversion of the mask
+  return lsr(as_int(), bits - 1); // shifts each lane right by bits-1; presumably a logical shift leaving 0 or 1 from the top bit -- confirm lsr
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+                                                   boolvec_t y) const {
+  return ifthen(x.as_int(), y.as_int()).as_bool(); // routed through the intvec_t overload below, then viewed as bool again
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+                                                  intvec_t y) const {
+  return ifthen(x.as_float(), y.as_float()).as_int(); // routed through the realvec_t overload; both casts are bit reinterpretations
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+                                                   realvec_t y) const {
 #ifdef __SSE4_1__
-    return _mm_blendv_ps(y.v, x.v, v);
+  return _mm_blendv_ps(y.v, x.v, v);
 #else
-    return (( -convert_int() & x.as_int()) |
-            (~-convert_int() & y.as_int())).as_float();
+  return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+      .as_float();
 #endif
-  }
+}
+
+// intvec definitions
 
-  
-  
-  // intvec definitions
-  
-  inline intvec<float,4> intvec<float,4>::abs() const
-  {
+inline intvec<float, 4> intvec<float, 4>::abs() const {
 #ifdef __SSSE3__
-    return _mm_abs_epi32(v);
+  return _mm_abs_epi32(v);
 #else
-    return MF::vml_abs(*this);
+  return MF::vml_abs(*this);
 #endif
-  }
-  
-  inline realvec<float,4> intvec<float,4>::as_float() const
-  {
-    return _mm_castsi128_ps(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
-                                                    intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<float,4> intvec<float,4>::convert_float() const
-  {
-    return _mm_cvtepi32_ps(v);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::max(intvec_t x) const
-  {
+}
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const { // bit-for-bit view of the integers as floats
+  return _mm_castsi128_ps(v); // cast intrinsic: reinterprets the register, no value conversion
+}
+
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y); // forwards to the generic routine with *this as the first argument
+}
+
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+  return MF::vml_clz(*this); // forwards to the generic routine; no SSE-specific path here
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const { // numeric int32 -> float conversion, lane by lane
+  return _mm_cvtepi32_ps(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::max(intvec_t x) const {
 #ifdef __SSE4_1__
-      return _mm_max_epi32(v, x.v);
+  return _mm_max_epi32(v, x.v);
 #else
-      return MF::vml_max(*this, x);
+  return MF::vml_max(*this, x);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::min(intvec_t x) const
-  {
+}
+
+inline intvec<float, 4> intvec<float, 4>::min(intvec_t x) const {
 #ifdef __SSE4_1__
-      return _mm_min_epi32(v, x.v);
+  return _mm_min_epi32(v, x.v);
 #else
-      return MF::vml_min(*this, x);
+  return MF::vml_min(*this, x);
 #endif
-  }
-  
-  inline intvec<float,4> intvec<float,4>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+}
+
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+  return MF::vml_popcount(*this); // forwards to the generic routine; no SSE-specific path here
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const { // overload taking a single scalar n
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const { // overload taking a vector n
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_SSE_FLOAT4_H
+#endif // #ifndef VEC_SSE_FLOAT4_H
diff --git a/vec_test.h b/vec_test.h
index 46fc9d1..c27b75e 100644
--- a/vec_test.h
+++ b/vec_test.h
@@ -9,1474 +9,1280 @@
 
 #include <cmath>
 #ifndef VML_NO_IOSTREAM
-#  include <sstream>
+#include <sstream>
 #endif
 
+namespace vecmathlib {
 
+template <typename T, int N> struct booltestvec;
+template <typename T, int N> struct inttestvec;
+template <typename T, int N> struct realtestvec;
+
+template <typename T, int N> struct booltestvec : floatprops<T> { // N-lane boolean vector stored as a plain bool[N]
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N; // number of lanes
+  typedef bool scalar_t;
+  typedef bool bvector_t[size]; // storage type: one bool per lane
+  static int const alignment = sizeof(bool);
+
+  typedef booltestvec boolvec_t;
+  typedef inttestvec<real_t, size> intvec_t;   // companion integer vector with the same lane count
+  typedef realtestvec<real_t, size> realvec_t; // companion real vector with the same lane count
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v; // the only data member
+
+  booltestvec() {} // empty body: the lanes are left uninitialized
+  // can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // booltestvec(booltestvec const& x): v(x.v) {}
+  // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; }
+  // booltestvec(vector_t x): v(x) {}
+  booltestvec(bool a) { // implicit broadcast: every lane is set to a
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  booltestvec(bool const *as) { // copies lanes from as[0 .. size-1]
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  bool operator[](int n) const { return v[n]; } // lane n by value; no bounds check
+  boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; } // comma operator: write lane n, then return *this for chaining
+
+  intvec_t as_int() const;      // defined after inttestvec
+  intvec_t convert_int() const; // defined after inttestvec
+
+  boolvec_t operator!() const { // lane-wise logical NOT
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = !v[d];
+    return res;
+  }
+
+  boolvec_t operator&&(boolvec_t x) const { // lane-wise AND; as an overloaded operator both operands are always evaluated (no short-circuit)
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] && x.v[d];
+    return res;
+  }
+  boolvec_t operator||(boolvec_t x) const { // lane-wise OR; as an overloaded operator both operands are always evaluated (no short-circuit)
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] || x.v[d];
+    return res;
+  }
+  boolvec_t operator==(boolvec_t x) const { // lane-wise equality; the result is a vector, not a single bool
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(boolvec_t x) const { // lane-wise inequality; the result is a vector, not a single bool
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+
+  bool all() const { // true when every lane is true
+    bool res = v[0]; // seeded from lane 0, so this reads v[0] unconditionally (requires size >= 1)
+    for (int d = 1; d < size; ++d)
+      res = res && v[d];
+    return res;
+  }
+  bool any() const { // true when at least one lane is true
+    bool res = v[0]; // seeded from lane 0, so this reads v[0] unconditionally (requires size >= 1)
+    for (int d = 1; d < size; ++d)
+      res = res || v[d];
+    return res;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const; // declaration only; presumably *this is the condition, x the then-value, y the else-value -- confirm
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after inttestvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
+};
+
+template <typename T, int N> struct inttestvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef int_t scalar_t;
+  typedef int_t ivector_t[size];
+  static int const alignment = sizeof(int_t);
+
+  typedef booltestvec<real_t, size> boolvec_t;
+  typedef inttestvec intvec_t;
+  typedef realtestvec<real_t, size> realvec_t;
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  inttestvec() {}
+  // can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // inttestvec(inttestvec const& x): v(x.v) {}
+  // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
+  // inttestvec(vector_t x): v(x) {}
+  inttestvec(int_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  inttestvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+  static intvec_t iota() {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = d;
+    return res;
+  }
+
+  int_t operator[](int n) const { return v[n]; }
+  intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+  boolvec_t as_bool() const { return convert_bool(); }
+  boolvec_t convert_bool() const {
+    // result: convert_bool(0)=false, convert_bool(else)=true
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d];
+    return res;
+  }
+  realvec_t as_float() const;      // defined after realtestvec
+  realvec_t convert_float() const; // defined after realtestvec
+
+  intvec_t operator+() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
+    return res;
+  }
+  intvec_t operator-() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
+    return res;
+  }
+
+  intvec_t &operator+=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  intvec_t &operator-=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  intvec_t &operator*=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  intvec_t &operator/=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
+  }
+  intvec_t &operator%=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] %= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator+(intvec_t x) const {
+    intvec_t res = *this;
+    return res += x;
+  }
+  intvec_t operator-(intvec_t x) const {
+    intvec_t res = *this;
+    return res -= x;
+  }
+  intvec_t operator*(intvec_t x) const {
+    intvec_t res = *this;
+    return res *= x;
+  }
+  intvec_t operator/(intvec_t x) const {
+    intvec_t res = *this;
+    return res /= x;
+  }
+  intvec_t operator%(intvec_t x) const {
+    intvec_t res = *this;
+    return res %= x;
+  }
+
+  intvec_t operator~() const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = ~v[d];
+    return res;
+  }
+
+  intvec_t &operator&=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] &= x.v[d];
+    return *this;
+  }
+  intvec_t &operator|=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] |= x.v[d];
+    return *this;
+  }
+  intvec_t &operator^=(intvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] ^= x.v[d];
+    return *this;
+  }
+
+  intvec_t operator&(intvec_t x) const {
+    intvec_t res = *this;
+    return res &= x;
+  }
+  intvec_t operator|(intvec_t x) const {
+    intvec_t res = *this;
+    return res |= x;
+  }
+  intvec_t operator^(intvec_t x) const {
+    intvec_t res = *this;
+    return res ^= x;
+  }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const {
+    return MF::vml_bitifthen(*this, x, y);
+  }
+
+  intvec_t lsr(int_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n));
+    return res;
+  }
+  intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t &operator>>=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n;
+    return *this;
+  }
+  intvec_t &operator<<=(int_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n;
+    return *this;
+  }
+  intvec_t operator>>(int_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(int_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t lsr(intvec_t n) const {
+    intvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = I(U(v[d]) >> U(n.v[d]));
+    return res;
+  }
+  intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
+  intvec_t &operator>>=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] >>= n.v[d];
+    return *this;
+  }
+  intvec_t &operator<<=(intvec_t n) {
+    for (int d = 0; d < size; ++d)
+      v[d] <<= n.v[d];
+    return *this;
+  }
+  intvec_t operator>>(intvec_t n) const {
+    intvec_t res = *this;
+    return res >>= n;
+  }
+  intvec_t operator<<(intvec_t n) const {
+    intvec_t res = *this;
+    return res <<= n;
+  }
+
+  intvec_t clz() const { return MF::vml_clz(*this); }
+  intvec_t popcount() const { return MF::vml_popcount(*this); }
+
+  boolvec_t operator==(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(intvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  intvec_t abs() const { return MF::vml_abs(*this); }
+  boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
+  intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
+  intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
+};
+
+template <typename T, int N> struct realtestvec : floatprops<T> {
+  typedef typename floatprops<T>::int_t int_t;
+  typedef typename floatprops<T>::uint_t uint_t;
+  typedef typename floatprops<T>::real_t real_t;
+
+  static int const size = N;
+  typedef real_t scalar_t;
+  typedef real_t vector_t[size];
+  static int const alignment = sizeof(real_t);
 
-namespace vecmathlib {
-  
-  template<typename T, int N> struct booltestvec;
-  template<typename T, int N> struct inttestvec;
-  template<typename T, int N> struct realtestvec;
-  
-  
-  
-  template<typename T, int N>
-  struct booltestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef bool scalar_t;
-    typedef bool bvector_t[size];
-    static int const alignment = sizeof(bool);
-    
-    typedef booltestvec boolvec_t;
-    typedef inttestvec<real_t, size> intvec_t;
-    typedef realtestvec<real_t, size> realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    booltestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // booltestvec(booltestvec const& x): v(x.v) {}
-    // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; }
-    //booltestvec(vector_t x): v(x) {}
-    booltestvec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
-    booltestvec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    bool operator[](int n) const { return v[n]; }
-    boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-    
-    
-    
-    intvec_t as_int() const;      // defined after inttestvec
-    intvec_t convert_int() const; // defined after inttestvec
-    
-    
-    
-    boolvec_t operator!() const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = !v[d];
-      return res;
-    }
-    
-    boolvec_t operator&&(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
-      return res;
-    }
-    boolvec_t operator||(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
-      return res;
-    }
-    boolvec_t operator==(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(boolvec_t x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    
-    bool all() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res && v[d];
-      return res;
-    }
-    bool any() const
-    {
-      bool res = v[0];
-      for (int d=1; d<size; ++d) res = res || v[d];
-      return res;
-    }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after inttestvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct inttestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef int_t scalar_t;
-    typedef int_t ivector_t[size];
-    static int const alignment = sizeof(int_t);
-    
-    typedef booltestvec<real_t, size> boolvec_t;
-    typedef inttestvec intvec_t;
-    typedef realtestvec<real_t, size> realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    inttestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // inttestvec(inttestvec const& x): v(x.v) {}
-    // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
-    //inttestvec(vector_t x): v(x) {}
-    inttestvec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    inttestvec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    static intvec_t iota()
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=d;
-      return res;
-    }
-    
-    int_t operator[](int n) const { return v[n]; }
-    intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-    
-    
-    
-    boolvec_t as_bool() const { return convert_bool(); }
-    boolvec_t convert_bool() const
-    {
-      // result: convert_bool(0)=false, convert_bool(else)=true
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d]=v[d];
-      return res;
-    }
-    realvec_t as_float() const;      // defined after realtestvec
-    realvec_t convert_float() const; // defined after realtestvec
-    
-    
-    
-    intvec_t operator+() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    intvec_t operator-() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    intvec_t& operator+=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    intvec_t& operator-=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    intvec_t& operator*=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    intvec_t& operator/=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    intvec_t& operator%=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] %= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator+(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res += x;
-    }
-    intvec_t operator-(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res -= x;
-    }
-    intvec_t operator*(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res *= x;
-    }
-    intvec_t operator/(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res /= x;
-    }
-    intvec_t operator%(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res %= x;
-    }
-    
-    
-    
-    intvec_t operator~() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
-      return res;
-    }
-    
-    intvec_t& operator&=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] &= x.v[d];
-      return *this;
-    }
-    intvec_t& operator|=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] |= x.v[d];
-      return *this;
-    }
-    intvec_t& operator^=(intvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] ^= x.v[d];
-      return *this;
-    }
-    
-    intvec_t operator&(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res &= x;
-    }
-    intvec_t operator|(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res |= x;
-    }
-    intvec_t operator^(intvec_t x) const
-    {
-      intvec_t res = *this;
-      return res ^= x;
-    }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const
-    {
-      return MF::vml_bitifthen(*this, x, y);
-    }
-    
-    
-    
-    intvec_t lsr(int_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
-      return res;
-    }
-    intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t& operator>>=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n;
-      return *this;
-    }
-    intvec_t& operator<<=(int_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n;
-      return *this;
-    }
-    intvec_t operator>>(int_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(int_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t lsr(intvec_t n) const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
-      return res;
-    }
-    intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
-    intvec_t& operator>>=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] >>= n.v[d];
-      return *this;
-    }
-    intvec_t& operator<<=(intvec_t n)
-    {
-      for (int d=0; d<size; ++d) v[d] <<= n.v[d];
-      return *this;
-    }
-    intvec_t operator>>(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res >>= n;
-    }
-    intvec_t operator<<(intvec_t n) const
-    {
-      intvec_t res = *this;
-      return res <<= n;
-    }
-    
-    intvec_t clz() const { return MF::vml_clz(*this); }
-    intvec_t popcount() const { return MF::vml_popcount(*this); }
-    
-    
-    
-    boolvec_t operator==(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(intvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    intvec_t abs() const { return MF::vml_abs(*this); }
-    boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
-    intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
-    intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
-  };
-  
-  
-  
-  template<typename T, int N>
-  struct realtestvec: floatprops<T>
-  {
-    typedef typename floatprops<T>::int_t int_t;
-    typedef typename floatprops<T>::uint_t uint_t;
-    typedef typename floatprops<T>::real_t real_t;
-    
-    static int const size = N;
-    typedef real_t scalar_t;
-    typedef real_t vector_t[size];
-    static int const alignment = sizeof(real_t);
-    
 #ifndef VML_NO_IOSTREAM
-    static char const* name()
-    {
-      static std::string name_;
-      if (name_.empty()) {
-        std::stringstream buf;
-        buf << "<VML:" << N << "*" << FP::name() << ">";
-        name_ = buf.str();
-      }
-      return name_.c_str();
+  static char const *name() {
+    static std::string name_;
+    if (name_.empty()) {
+      std::stringstream buf;
+      buf << "<VML:" << N << "*" << FP::name() << ">";
+      name_ = buf.str();
     }
+    return name_.c_str();
+  }
 #endif
-    void barrier()
-    {
+  void barrier() {
 #if defined __GNUC__ && !defined __clang__ && !defined __ICC
-      // GCC crashes when +X is used as constraint
-#  if defined __SSE2__
-      for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-#  elif defined __PPC64__       // maybe also __PPC__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-#  elif defined __arm__
-      for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-#  else
-#    error "Floating point barrier undefined on this architecture"
-#  endif
+// GCC crashes on the "+X" asm constraint, so pick a constraint per target
+#if defined __SSE2__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+x"(v[d]));
+#elif defined __PPC64__ // maybe also __PPC__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
 #elif defined __clang__
-      for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+X"(v[d]));
 #elif defined __ICC
-      for (int d=0; d<size; ++d) {
-        real_t tmp = v[d];
-        __asm__("": "+X"(tmp));
-        v[d] = tmp;
-      }
+    for (int d = 0; d < size; ++d) {
+      real_t tmp = v[d];
+      __asm__("" : "+X"(tmp));
+      v[d] = tmp;
+    }
 #elif defined __IBMCPP__
-      for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+    for (int d = 0; d < size; ++d)
+      __asm__("" : "+f"(v[d]));
 #else
-#  error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
 #endif
-    }
-    
-    typedef booltestvec<real_t, size> boolvec_t;
-    typedef inttestvec<real_t, size> intvec_t;
-    typedef realtestvec realvec_t;
-    
-    // short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realtestvec() {}
-    // can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realtestvec(realtestvec const& x): v(x.v) {}
-    // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
-    //realtestvec(vector_t x): v(x) {}
-    realtestvec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
-    realtestvec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-    
-    real_t operator[](int n) const { return v[n]; }
-    realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = p[d];
-      return res;
-    }
-    static realvec_t loadu(real_t const* p, size_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loada(p), *this);
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p), *this);
-    }
-    realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
-    {
-      return m.m.ifthen(loadu(p, ioff), *this);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p);
-    }
-    void storeu(real_t* p) const
-    {
-      for (int d=0; d<size; ++d) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p, m);
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
-    }
-    void storeu(real_t* p, size_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const
-    {
-      intvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
-      return res;
-    }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec_t operator+() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = + v[d];
-      return res;
-    }
-    realvec_t operator-() const
-    {
-      realvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = - v[d];
-      return res;
-    }
-    
-    realvec_t& operator+=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] += x.v[d];
-      return *this;
-    }
-    realvec_t& operator-=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] -= x.v[d];
-      return *this;
-    }
-    realvec_t& operator*=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] *= x.v[d];
-      return *this;
-    }
-    realvec_t& operator/=(realvec_t const& x)
-    {
-      for (int d=0; d<size; ++d) v[d] /= x.v[d];
-      return *this;
-    }
-    
-    realvec_t operator+(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res += x;
-    }
-    realvec_t operator-(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res -= x;
-    }
-    realvec_t operator*(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res *= x;
-    }
-    realvec_t operator/(realvec_t x) const
-    {
-      realvec_t res = *this;
-      return res /= x;
-    }
-    
-    real_t maxval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
-      return res;
-    }
-    real_t minval() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
-      return res;
-    }
-    real_t prod() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res *= v[d];
-      return res;
-    }
-    real_t sum() const
-    {
-      real_t res = v[0];
-      for (int d=1; d<size; ++d) res += v[d];
-      return res;
-    }
-    
-    
-    
-    boolvec_t operator==(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
-      return res;
-    }
-    boolvec_t operator!=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
-      return res;
-    }
-    boolvec_t operator<(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
-      return res;
-    }
-    boolvec_t operator<=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
-      return res;
-    }
-    boolvec_t operator>(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
-      return res;
-    }
-    boolvec_t operator>=(realvec_t const& x) const
-    {
-      boolvec_t res;
-      for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
-      return res;
-    }
-    
-    
-    
-    realvec_t acos() const { return MF::vml_acos(*this); }
-    realvec_t acosh() const { return MF::vml_acosh(*this); }
-    realvec_t asin() const { return MF::vml_asin(*this); }
-    realvec_t asinh() const { return MF::vml_asinh(*this); }
-    realvec_t atan() const { return MF::vml_atan(*this); }
-    realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
-    realvec_t atanh() const { return MF::vml_atanh(*this); }
-    realvec_t cbrt() const { return MF::vml_cbrt(*this); }
-    realvec_t ceil() const { return MF::vml_ceil(*this); }
-    realvec_t copysign(realvec_t y) const
-    {
-      return MF::vml_copysign(*this, y);
-    }
-    realvec_t cos() const { return MF::vml_cos(*this); }
-    realvec_t cosh() const { return MF::vml_cosh(*this); }
-    realvec_t exp() const { return MF::vml_exp(*this); }
-    realvec_t exp10() const { return MF::vml_exp10(*this); }
-    realvec_t exp2() const { return MF::vml_exp2(*this); }
-    realvec_t expm1() const { return MF::vml_expm1(*this); }
-    realvec_t fabs() const { return MF::vml_fabs(*this); }
-    realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
-    realvec_t floor() const { return MF::vml_floor(*this); }
-    realvec_t fma(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_fma(*this, y, z);
-    }
-    realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
-    realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
-    realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
-    realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec_t log() const { return MF::vml_log(*this); }
-    realvec_t log10() const { return MF::vml_log10(*this); }
-    realvec_t log1p() const { return MF::vml_log1p(*this); }
-    realvec_t log2() const { return MF::vml_log2(*this); }
-    intvec_t lrint() const { return MF::vml_lrint(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
-    }
-    realvec_t nextafter(realvec_t y) const
-    {
-      return MF::vml_nextafter(*this, y);
-    }
-    realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
-    realvec_t rcp() const { return MF::vml_rcp(*this); }
-    realvec_t remainder(realvec_t y) const
-    {
-      return MF::vml_remainder(*this, y);
-    }
-    realvec_t rint() const { return MF::vml_rint(*this); }
-    realvec_t round() const { return MF::vml_round(*this); }
-    realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec_t sin() const { return MF::vml_sin(*this); }
-    realvec_t sinh() const { return MF::vml_sinh(*this); }
-    realvec_t sqrt() const { return MF::vml_sqrt(*this); }
-    realvec_t tan() const { return MF::vml_tan(*this); }
-    realvec_t tanh() const { return MF::vml_tanh(*this); }
-    realvec_t trunc() const { return MF::vml_trunc(*this); }
-  };
-  
-  
-  
-  // booltestvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::as_int() const
-  {
-    return convert_int();
-  }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::convert_int() const
-  {
-    intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d];
-    return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::boolvec_t
-  booltestvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    boolvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+
+  typedef booltestvec<real_t, size> boolvec_t;
+  typedef inttestvec<real_t, size> intvec_t;
+  typedef realtestvec realvec_t;
+
+  // short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realtestvec() {}
+  // Do not add a non-trivial copy constructor here; with one, objects
+  // of this type won't be passed in registers
+  // realtestvec(realtestvec const& x): v(x.v) {}
+  // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
+  // realtestvec(vector_t x): v(x) {}
+  realtestvec(real_t a) {
+    for (int d = 0; d < size; ++d)
+      v[d] = a;
+  }
+  realtestvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      v[d] = as[d];
+  }
+
+  real_t operator[](int n) const { return v[n]; }
+  realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    realvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = p[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::intvec_t
-  booltestvec<T,N>::ifthen(intvec_t x, intvec_t y) const
-  {
+  static realvec_t loadu(real_t const *p, size_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loada(p), *this);
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    return m.m.ifthen(loadu(p), *this);
+  }
+  realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const {
+    return m.m.ifthen(loadu(p, ioff), *this);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p);
+  }
+  void storeu(real_t *p) const {
+    for (int d = 0; d < size; ++d)
+      p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p, m);
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    for (int d = 0; d < size; ++d)
+      if (m.m[d])
+        p[d] = v[d];
+  }
+  void storeu(real_t *p, size_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const {
     intvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = FP::as_int(v[d]);
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename booltestvec<T,N>::realvec_t
-  booltestvec<T,N>::ifthen(realvec_t x, realvec_t y) const
-  {
+  intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+
+  realvec_t operator+() const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+    for (int d = 0; d < size; ++d)
+      res.v[d] = +v[d];
     return res;
   }
-
-  
-  
-  // inttestvec definitions
-  
-  template<typename T, int N>
-  inline
-  typename inttestvec<T,N>::realvec_t
-  inttestvec<T,N>::as_float() const
-  {
+  realvec_t operator-() const {
     realvec_t res;
-    for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+    for (int d = 0; d < size; ++d)
+      res.v[d] = -v[d];
     return res;
   }
-  
-  template<typename T, int N>
-  inline
-  typename inttestvec<T,N>::realvec_t
-  inttestvec<T,N>::convert_float() const
-  {
-    return MF::vml_convert_float(*this);
-  }
-  
-
-
-  // Wrappers
-  
-  // booltestvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> as_int(booltestvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline bool all(booltestvec<real_t, size> x) { return x.all(); }
-  
-  template<typename real_t, int size>
-  inline bool any(booltestvec<real_t, size> x) { return x.any(); }
-  
-  template<typename real_t, int size>
-  inline
-  booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                   booltestvec<real_t, size> x,
-                                   booltestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                  inttestvec<real_t, size> x,
-                                  inttestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
-                                   realtestvec<real_t, size> x,
-                                   realtestvec<real_t, size> y)
-  {
-    return c.ifthen(x, y);
-  }
-  
-  
-  
-  // inttestvec wrappers
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x)
-  {
-    return x.abs();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x)
-  {
-    return x.as_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x)
-  {
-    return x.as_float();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
-                                     inttestvec<real_t, size> y,
-                                     inttestvec<real_t, size> z)
-  {
-    return x.bitifthen(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x)
-  {
-    return x.clz();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x)
-  {
-    return x.convert_bool();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x)
-  {
-    return x.convert_float();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x)
-  {
-    return x.isignbit();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
-                               typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> n)
-  {
-    return x.lsr(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> max(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> y)
-  {
-    return x.max(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> min(inttestvec<real_t, size> x,
-                               inttestvec<real_t, size> y)
-  {
-    return x.min(y);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> popcount(inttestvec<real_t, size> x)
-  {
-    return x.popcount();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
-                                  typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.rotate(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
-                                  inttestvec<real_t, size> n)
-  {
-    return x.rotate(n);
-  }
-  
-  
-  
-  // realtestvec wrappers
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size>
-  loada(real_t const* p,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loada(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size>
-  loadu(real_t const* p,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size>
-  loadu(real_t const* p, size_t ioff,
-        realtestvec<real_t, size> x,
-        typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.loadu(p, ioff, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realtestvec<real_t, size> x, real_t* p)
-  {
-    return x.storea(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p)
-  {
-    return x.storeu(p);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff)
-  {
-    return x.storeu(p, ioff);
-  }
-  
-  template<typename real_t, int size>
-  inline void storea(realtestvec<real_t, size> x, real_t* p,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storea(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, m);
-  }
-  
-  template<typename real_t, int size>
-  inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff,
-                     typename realtestvec<real_t, size>::mask_t const& m)
-  {
-    return x.storeu(p, ioff, m);
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x)
-  {
-    return x.as_int();
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x)
-  {
-    return x.convert_int();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t maxval(realtestvec<real_t, size> x)
-  {
-    return x.maxval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t minval(realtestvec<real_t, size> x)
-  {
-    return x.minval();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t prod(realtestvec<real_t, size> x)
-  {
-    return x.prod();
-  }
-  
-  template<typename real_t, int size>
-  inline real_t sum(realtestvec<real_t, size> x)
-  {
-    return x.sum();
-  }
-  
-  
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x)
-  {
-    return x.acos();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x)
-  {
-    return x.acosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x)
-  {
-    return x.asin();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x)
-  {
-    return x.asinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x)
-  {
-    return x.atan();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
-                                         realtestvec<real_t, size> y)
-  {
-    return x.atan2(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x)
-  {
-    return x.atanh();
-  }
-    
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x)
-  {
-    return x.cbrt();
-  }
-    
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x)
-  {
-    return x.ceil();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
-                                            realtestvec<real_t, size> y)
-  {
-    return x.copysign(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x)
-  {
-    return x.cos();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x)
-  {
-    return x.cosh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x)
-  {
-    return x.exp();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x)
-  {
-    return x.exp10();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x)
-  {
-    return x.exp2();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x)
-  {
-    return x.expm1();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x)
-  {
-    return x.fabs();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x)
-  {
-    return x.floor();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fdim(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y,
-                                       realtestvec<real_t, size> z)
-  {
-    return x.fma(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmax(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmin(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
-                                        realtestvec<real_t, size> y)
-  {
-    return x.fmod(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
-                                         inttestvec<real_t, size>* r)
-  {
-    return x.frexp(r);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
-                                         realtestvec<real_t, size> y)
-  {
-    return x.hypot(y);
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x)
-  {
-    return x.ilogb();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x)
-  {
-    return x.isfinite();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x)
-  {
-    return x.isinf();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x)
-  {
-    return x.isnan();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x)
-  {
-    return x.isnormal();
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
-                                  typename inttestvec<real_t, size>::int_t n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline
-  realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
-                                  inttestvec<real_t, size> n)
-  {
-    return x.ldexp(n);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log(realtestvec<real_t, size> x)
-  {
-    return x.log();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x)
-  {
-    return x.log10();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x)
-  {
-    return x.log1p();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x)
-  {
-    return x.log2();
-  }
-  
-  template<typename real_t, int size>
-  inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x)
-  {
-    return x.lrint();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y,
-                                       realtestvec<real_t, size> z)
-  {
-    return x.mad(y, z);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
-                                             realtestvec<real_t, size> y)
-  {
-    return x.nextafter(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
-                                       realtestvec<real_t, size> y)
-  {
-    return x.pow(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x)
-  {
-    return x.rcp();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
-                                             realtestvec<real_t, size> y)
-  {
-    return x.remainder(y);
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x)
-  {
-    return x.rint();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> round(realtestvec<real_t, size> x)
-  {
-    return x.round();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x)
-  {
-    return x.rsqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x)
-  {
-    return x.signbit();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x)
-  {
-    return x.sin();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x)
-  {
-    return x.sinh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x)
-  {
-    return x.sqrt();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x)
-  {
-    return x.tan();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x)
-  {
-    return x.tanh();
-  }
-  
-  template<typename real_t, int size>
-  inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x)
-  {
-    return x.trunc();
-  }
-  
-  
-  
-#ifndef VML_NO_IOSTREAM
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           booltestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           inttestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
-  }
-  
-  template<typename real_t, int size>
-  std::ostream& operator<<(std::ostream& os,
-                           realtestvec<real_t, size> const& x)
-  {
-    os << "[";
-    for (int i=0; i<size; ++i) {
-      if (i!=0) os << ",";
-      os << x[i];
-    }
-    os << "]";
-    return os;
+
+  realvec_t &operator+=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] += x.v[d];
+    return *this;
+  }
+  realvec_t &operator-=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] -= x.v[d];
+    return *this;
+  }
+  realvec_t &operator*=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] *= x.v[d];
+    return *this;
+  }
+  realvec_t &operator/=(realvec_t const &x) {
+    for (int d = 0; d < size; ++d)
+      v[d] /= x.v[d];
+    return *this;
   }
+
+  realvec_t operator+(realvec_t x) const {
+    realvec_t res = *this;
+    return res += x;
+  }
+  realvec_t operator-(realvec_t x) const {
+    realvec_t res = *this;
+    return res -= x;
+  }
+  realvec_t operator*(realvec_t x) const {
+    realvec_t res = *this;
+    return res *= x;
+  }
+  realvec_t operator/(realvec_t x) const {
+    realvec_t res = *this;
+    return res /= x;
+  }
+
+  real_t maxval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmax(res, v[d]);
+    return res;
+  }
+  real_t minval() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res = vml_std::fmin(res, v[d]);
+    return res;
+  }
+  real_t prod() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res *= v[d];
+    return res;
+  }
+  real_t sum() const {
+    real_t res = v[0];
+    for (int d = 1; d < size; ++d)
+      res += v[d];
+    return res;
+  }
+
+  boolvec_t operator==(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] == x.v[d];
+    return res;
+  }
+  boolvec_t operator!=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] != x.v[d];
+    return res;
+  }
+  boolvec_t operator<(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] < x.v[d];
+    return res;
+  }
+  boolvec_t operator<=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] <= x.v[d];
+    return res;
+  }
+  boolvec_t operator>(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] > x.v[d];
+    return res;
+  }
+  boolvec_t operator>=(realvec_t const &x) const {
+    boolvec_t res;
+    for (int d = 0; d < size; ++d)
+      res.v[d] = v[d] >= x.v[d];
+    return res;
+  }
+
+  realvec_t acos() const { return MF::vml_acos(*this); }
+  realvec_t acosh() const { return MF::vml_acosh(*this); }
+  realvec_t asin() const { return MF::vml_asin(*this); }
+  realvec_t asinh() const { return MF::vml_asinh(*this); }
+  realvec_t atan() const { return MF::vml_atan(*this); }
+  realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+  realvec_t atanh() const { return MF::vml_atanh(*this); }
+  realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+  realvec_t ceil() const { return MF::vml_ceil(*this); }
+  realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+  realvec_t cos() const { return MF::vml_cos(*this); }
+  realvec_t cosh() const { return MF::vml_cosh(*this); }
+  realvec_t exp() const { return MF::vml_exp(*this); }
+  realvec_t exp10() const { return MF::vml_exp10(*this); }
+  realvec_t exp2() const { return MF::vml_exp2(*this); }
+  realvec_t expm1() const { return MF::vml_expm1(*this); }
+  realvec_t fabs() const { return MF::vml_fabs(*this); }
+  realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+  realvec_t floor() const { return MF::vml_floor(*this); }
+  realvec_t fma(realvec_t y, realvec_t z) const {
+    return MF::vml_fma(*this, y, z);
+  }
+  realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
+  realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
+  realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+  realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec_t log() const { return MF::vml_log(*this); }
+  realvec_t log10() const { return MF::vml_log10(*this); }
+  realvec_t log1p() const { return MF::vml_log1p(*this); }
+  realvec_t log2() const { return MF::vml_log2(*this); }
+  intvec_t lrint() const { return MF::vml_lrint(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+  realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+  realvec_t rcp() const { return MF::vml_rcp(*this); }
+  realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+  realvec_t rint() const { return MF::vml_rint(*this); }
+  realvec_t round() const { return MF::vml_round(*this); }
+  realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec_t sin() const { return MF::vml_sin(*this); }
+  realvec_t sinh() const { return MF::vml_sinh(*this); }
+  realvec_t sqrt() const { return MF::vml_sqrt(*this); }
+  realvec_t tan() const { return MF::vml_tan(*this); }
+  realvec_t tanh() const { return MF::vml_tanh(*this); }
+  realvec_t trunc() const { return MF::vml_trunc(*this); }
+};
+
+// booltestvec definitions
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t booltestvec<T, N>::as_int() const {
+  return convert_int();
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::convert_int() const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::boolvec_t
+booltestvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+  boolvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+  intvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::realvec_t
+booltestvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = v[d] ? x.v[d] : y.v[d];
+  return res;
+}
+
+// inttestvec definitions
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t inttestvec<T, N>::as_float() const {
+  realvec_t res;
+  for (int d = 0; d < size; ++d)
+    res.v[d] = FP::as_float(v[d]);
+  return res;
+}
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t
+inttestvec<T, N>::convert_float() const {
+  return MF::vml_convert_float(*this);
+}
+
+// Wrappers
+
+// booltestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(booltestvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(booltestvec<real_t, size> x) {
+  return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(booltestvec<real_t, size> x) {
+  return x.any();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                        booltestvec<real_t, size> x,
+                                        booltestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                       inttestvec<real_t, size> x,
+                                       inttestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+                                        realtestvec<real_t, size> x,
+                                        realtestvec<real_t, size> y) {
+  return c.ifthen(x, y);
+}
+
+// inttestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x) {
+  return x.abs();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x) {
+  return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x) {
+  return x.as_float();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
+                                          inttestvec<real_t, size> y,
+                                          inttestvec<real_t, size> z) {
+  return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x) {
+  return x.clz();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x) {
+  return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x) {
+  return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x) {
+  return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+lsr(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> n) {
+  return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> max(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> y) {
+  return x.max(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> min(inttestvec<real_t, size> x,
+                                    inttestvec<real_t, size> y) {
+  return x.min(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> popcount(inttestvec<real_t, size> x) {
+  return x.popcount();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+rotate(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
+                                       inttestvec<real_t, size> n) {
+  return x.rotate(n);
+}
+
+// realtestvec wrappers
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loada(real_t const *p, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realtestvec<real_t, size> x,
+      typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p) {
+  return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p) {
+  return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff) {
+  return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff,
+                   typename realtestvec<real_t, size>::mask_t const &m) {
+  return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x) {
+  return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x) {
+  return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realtestvec<real_t, size> x) {
+  return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realtestvec<real_t, size> x) {
+  return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realtestvec<real_t, size> x) {
+  return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realtestvec<real_t, size> x) {
+  return x.sum();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x) {
+  return x.acos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x) {
+  return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x) {
+  return x.asin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x) {
+  return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x) {
+  return x.atan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
+                                       realtestvec<real_t, size> y) {
+  return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x) {
+  return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x) {
+  return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x) {
+  return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
+                                          realtestvec<real_t, size> y) {
+  return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x) {
+  return x.cos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x) {
+  return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x) {
+  return x.exp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x) {
+  return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x) {
+  return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x) {
+  return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x) {
+  return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x) {
+  return x.floor();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y,
+                                     realtestvec<real_t, size> z) {
+  return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
+                                      realtestvec<real_t, size> y) {
+  return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
+                                       inttestvec<real_t, size> *r) {
+  return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
+                                       realtestvec<real_t, size> y) {
+  return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x) {
+  return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x) {
+  return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x) {
+  return x.isinf();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x) {
+  return x.isnan();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x) {
+  return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+ldexp(realtestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
+                                       inttestvec<real_t, size> n) {
+  return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log(realtestvec<real_t, size> x) {
+  return x.log();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x) {
+  return x.log10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x) {
+  return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x) {
+  return x.log2();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x) {
+  return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y,
+                                     realtestvec<real_t, size> z) {
+  return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
+                                           realtestvec<real_t, size> y) {
+  return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
+                                     realtestvec<real_t, size> y) {
+  return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x) {
+  return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
+                                           realtestvec<real_t, size> y) {
+  return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x) {
+  return x.rint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> round(realtestvec<real_t, size> x) {
+  return x.round();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x) {
+  return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x) {
+  return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x) {
+  return x.sin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x) {
+  return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x) {
+  return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x) {
+  return x.tan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x) {
+  return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x) {
+  return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, booltestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, inttestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realtestvec<real_t, size> const &x) {
+  os << "[";
+  for (int i = 0; i < size; ++i) {
+    if (i != 0)
+      os << ",";
+    os << x[i];
+  }
+  os << "]";
+  return os;
+}
 #endif
-  
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_TEST_H
+#endif // #ifndef VEC_TEST_H
diff --git a/vec_vsx_double2.h b/vec_vsx_double2.h
index 6725859..fa43a6f 100644
--- a/vec_vsx_double2.h
+++ b/vec_vsx_double2.h
@@ -13,679 +13,572 @@
 #include <altivec.h>
 
 #if defined __clang__
-#  define __vector vector
-#  define __pixel pixel
-#  define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
 #elif defined __gcc__
-#  undef vector
-#  undef pixel
-#  undef bool
+#undef vector
+#undef pixel
+#undef bool
 #elif defined __xlC__
-#  define __bool bool
+#define __bool bool
 #else
-#  error "Unknown compiler"
+#error "Unknown compiler"
 #endif
 
-
-
 namespace vecmathlib {
-  
+
 #define VECMATHLIB_HAVE_VEC_DOUBLE_2
-  template<> struct boolvec<double,2>;
-  template<> struct intvec<double,2>;
-  template<> struct realvec<double,2>;
-  
-  
-  
-  template<>
-  struct boolvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef bool scalar_t;
-    typedef __vector __bool long long bvector_t;
-    static int const alignment = sizeof(bvector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
-                  "vector size is wrong");
-    
-  private:
-    // true values are -1, false values are 0
-    // truth values are interpreted bit-wise
-    static uint_t from_bool(bool a) { return -int_t(a); }
-    static bool to_bool(uint_t a) { return a; }
-  public:
-    
-    typedef boolvec boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    bvector_t v;
-    
-    boolvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // boolvec(boolvec const& x): v(x.v) {}
-    // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
-    boolvec(bvector_t x): v(x) {}
-    boolvec(bool a): v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
-    boolvec(bool const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator bvector_t() const { return v; }
-    bool operator[](int n) const
-    {
-      return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
-    }
-    boolvec& set_elt(int n, bool a)
-    {
-      return
-        vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
-    }
-    
-    
-    
-    intvec_t as_int() const;      // defined after intvec
-    intvec_t convert_int() const; // defined after intvec
-    
-    
-    
-    boolvec operator!() const { return vec_nor(v, v); }
-    
-    boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
-    boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
-    boolvec operator==(boolvec x) const { return !(*this!=x); }
-    boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-    
-    bool all() const { return vec_all_ne(v, BV(false)); }
-    bool any() const { return vec_any_ne(v, BV(false)); }
-    
-    
-    
-    // ifthen(condition, then-value, else-value)
-    boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
-    intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
-    realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
-  };
-  
-  
-  
-  template<>
-  struct intvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef int_t scalar_t;
-    typedef __vector signed long long ivector_t;
-    static int const alignment = sizeof(ivector_t);
-    
-    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec intvec_t;
-    typedef realvec<real_t, size> realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    ivector_t v;
-    
-    intvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // intvec(intvec const& x): v(x.v) {}
-    // intvec& operator=(intvec const& x) { return v=x.v, *this; }
-    intvec(ivector_t x): v(x) {}
-    intvec(int_t a): v(vec_splats((long long)a)) {}
-    intvec(int_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    static intvec iota() { return (__vector signed long long){0, 1}; }
-    
-    operator ivector_t() const { return v; }
-    int_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
-    }
-    intvec_t& set_elt(int n, int_t a)
-    {
-      return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
-    }
-    
-    
-    
-    // Vector casts do not change the bit battern
-    boolvec_t as_bool() const { return (__vector __bool long long)v; }
-    boolvec_t convert_bool() const { return *this != IV(I(0)); }
-    realvec_t as_float() const;      // defined after realvec
-    realvec_t convert_float() const; // defined after realvec
-    
-    
-    
-    // Permutation control words
-  private:
-    // 0123 4567 -> 1436
-    // exchange pairs
-    static __vector unsigned char perm_int_swap()
-    {
-      return
-        (__vector unsigned char)
-        {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27};
-    }
-    // 0123 4567 -> 0426
-    // broadcast high elements of pairs
-    static __vector unsigned char perm_int_bchi()
-    {
-      return
-        (__vector unsigned char)
-        {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
-    }
-  public:
-    
-    
-
-    intvec operator+() const { return *this; }
-    intvec operator-() const { return vec_neg(v); }
-    
-    intvec operator+(intvec x) const { return vec_add(v, x.v); }
-    intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-    intvec operator*(intvec x) const { return vec_mul(v, x.v); }
-    intvec operator/(intvec x) const { return vec_div(v, x.v); }
-    intvec operator%(intvec x) const { return *this - *this / x * x; }
-    
-    intvec& operator+=(intvec const& x) { return *this=*this+x; }
-    intvec& operator-=(intvec const& x) { return *this=*this-x; }
-    intvec& operator*=(intvec const& x) { return *this=*this*x; }
-    intvec& operator/=(intvec const& x) { return *this=*this/x; }
-    intvec& operator%=(intvec const& x) { return *this=*this%x; }
-    
-    
-    
-    intvec operator~() const
-    {
-      return (__vector signed long long)vec_nor((__vector signed int)v, (__vector signed int)v);
-    }
-    
-    intvec operator&(intvec x) const
-    {
-      return (__vector signed long long)vec_and((__vector signed int)v, (__vector signed int)x.v);
-    }
-    intvec operator|(intvec x) const
-    {
-      return (__vector signed long long)vec_or ((__vector signed int)v, (__vector signed int)x.v);
-    }
-    intvec operator^(intvec x) const
-    {
-      return (__vector signed long long)vec_xor((__vector signed int)v, (__vector signed int)x.v);
-    }
-    
-    intvec& operator&=(intvec const& x) { return *this=*this&x; }
-    intvec& operator|=(intvec const& x) { return *this=*this|x; }
-    intvec& operator^=(intvec const& x) { return *this=*this^x; }
-    
-    intvec_t bitifthen(intvec_t x, intvec_t y) const;
-    
-    
-    
-    intvec lsr(int_t n) const { return lsr(IV(n)); }
-    intvec_t rotate(int_t n) const;
-    intvec operator>>(int_t n) const { return *this >> IV(n); }
-    intvec operator<<(int_t n) const { return *this << IV(n); }
-    intvec& operator>>=(int_t n) { return *this=*this>>n; }
-    intvec& operator<<=(int_t n) { return *this=*this<<n; }
-    
-    intvec lsr(intvec n) const
-    {
-      // return vec_sr(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, U((*this)[i]) >> U(n[i]));
-      }
-      return r;
-    }
-    intvec_t rotate(intvec_t n) const;
-    intvec operator>>(intvec n) const
-    {
-      // return vec_sra(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] >> n[i]);
-      }
-      return r;
-    }
-    intvec operator<<(intvec n) const
-    {
-      // return vec_sl(v, (__vector unsigned long long)n.v);
-      intvec r;
-      for (int i=0; i<size; ++i) {
-        r.set_elt(i, (*this)[i] << n[i]);
-      }
-      return r;
-    }
-    intvec& operator>>=(intvec n) { return *this=*this>>n; }
-    intvec& operator<<=(intvec n) { return *this=*this<<n; }
-    
-    intvec_t clz() const;
-    intvec_t popcount() const;
-    
-    
-    
-    boolvec_t operator==(intvec const& x) const
-    {
-      // return vec_cmpeq(v, x.v);
-      __vector signed int a = (__vector signed int)v;
-      __vector signed int b = (__vector signed int)x.v;
-      __vector __bool int c = vec_cmpeq(a, b);
-      __vector __bool int cx = vec_perm(c, c, perm_int_swap());
-      __vector __bool int r = vec_and(c, cx);
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
-    boolvec_t operator<(intvec const& x) const
-    {
-      __vector signed int a = (__vector signed int)v;
-      __vector signed int b = (__vector signed int)x.v;
-      __vector __bool int lt = vec_cmplt(a, b);
-      __vector __bool int eq = vec_cmpeq(a, b);
-      __vector unsigned int ua = (__vector unsigned int)v;
-      __vector unsigned int ub = (__vector unsigned int)x.v;
-      __vector __bool int ult = vec_cmplt(ua, ub);
-      __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
-      __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
-      r = vec_perm(r, r, perm_int_bchi());
-      return (__vector __bool long long)r;
-    }
-    boolvec_t operator<=(intvec const& x) const
-    {
-      return ! (*this > x);
-    }
-    boolvec_t operator>(intvec const& x) const
-    {
-      return x < *this;
-    }
-    boolvec_t operator>=(intvec const& x) const
-    {
-      return ! (*this < x);
-    }
-    
-    intvec_t abs() const;
-    boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
-    intvec_t max(intvec_t x) const;
-    intvec_t min(intvec_t x) const;
-  };
-  
-  
-  
-  template<>
-  struct realvec<double,2>: floatprops<double>
-  {
-    static int const size = 2;
-    typedef real_t scalar_t;
-    typedef __vector double vector_t;
-    static int const alignment = sizeof(vector_t);
-    
-    static char const* name() { return "<VSX:2*double>"; }
-    void barrier() { __asm__("": "+v"(v)); }
-    
-    static_assert(size * sizeof(real_t) == sizeof(vector_t),
-                  "vector size is wrong");
-    
-    typedef boolvec<real_t, size> boolvec_t;
-    typedef intvec<real_t, size> intvec_t;
-    typedef realvec realvec_t;
-    
-    // Short names for type casts
-    typedef real_t R;
-    typedef int_t I;
-    typedef uint_t U;
-    typedef realvec_t RV;
-    typedef intvec_t IV;
-    typedef boolvec_t BV;
-    typedef floatprops<real_t> FP;
-    typedef mathfuncs<realvec_t> MF;
-    
-    
-    
-    vector_t v;
-    
-    realvec() {}
-    // Can't have a non-trivial copy constructor; if so, objects won't
-    // be passed in registers
-    // realvec(realvec const& x): v(x.v) {}
-    // realvec& operator=(realvec const& x) { return v=x.v, *this; }
-    realvec(vector_t x): v(x) {}
-    realvec(real_t a): v(vec_splats(a)) {}
-    realvec(real_t const* as)
-    {
-      for (int d=0; d<size; ++d) set_elt(d, as[d]);
-    }
-    
-    operator vector_t() const { return v; }
-    real_t operator[](int n) const
-    {
-      return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
-    }
-    realvec_t& set_elt(int n, real_t a)
-    {
-      return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
-    }
-    
-    
-    
-    typedef vecmathlib::mask_t<realvec_t> mask_t;
-    
-    static realvec_t loada(real_t const* p)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      return vec_xld2(0, (real_t*)p);
-    }
-    static realvec_t loadu(real_t const* p)
-    {
-      // TODO: Can this handle unaligned access?
-      return vec_xld2(0, (real_t*)p);
-    }
-    static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff);
-      return loadu(p+ioff);
-    }
-    realvec_t loada(real_t const* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(all(m.m), true)) {
-        return loada(p);
-      } else {
-        return m.m.ifthen(loada(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        return loadu(p);
-      } else {
-        return m.m.ifthen(loadu(p), *this);
-      }
-    }
-    realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return loada(p+ioff, m);
-      return loadu(p+ioff, m);
-    }
-    
-    void storea(real_t* p) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      vec_xstd2(v, 0, p);
-    }
-    void storeu(real_t* p) const
-    {
-      // Vector stores would require vector loads, which would need to
-      // be atomic
-      // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
-      p[0] = (*this)[0];
-      p[1] = (*this)[1];
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff);
-      storeu(p+ioff);
-    }
-    void storea(real_t* p, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (__builtin_expect(m.all_m, true)) {
-        storea(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, mask_t const& m) const
-    {
-      if (__builtin_expect(m.all_m, true)) {
-        storeu(p);
-      } else {
-        // Use vec_ste?
-        if (m.m[0]) p[0] = (*this)[0];
-        if (m.m[1]) p[1] = (*this)[1];
-      }
-    }
-    void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
-    {
-      VML_ASSERT(intptr_t(p) % alignment == 0);
-      if (ioff % realvec::size == 0) return storea(p+ioff, m);
-      storeu(p+ioff, m);
-    }
-    
-    
-    
-    intvec_t as_int() const { return (__vector signed long long) v; }
-    intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-    
-    
-    
-    realvec operator+() const { return *this; }
-    realvec operator-() const { return RV(0.0) - *this; }
-    
-    realvec operator+(realvec x) const { return vec_add(v, x.v); }
-    realvec operator-(realvec x) const { return vec_sub(v, x.v); }
-    realvec operator*(realvec x) const { return vec_mul(v, x.v); }
-    realvec operator/(realvec x) const { return vec_div(v, x.v); }
-    
-    realvec& operator+=(realvec const& x) { return *this=*this+x; }
-    realvec& operator-=(realvec const& x) { return *this=*this-x; }
-    realvec& operator*=(realvec const& x) { return *this=*this*x; }
-    realvec& operator/=(realvec const& x) { return *this=*this/x; }
-    
-    real_t maxval() const
-    {
-      return vml_std::fmax((*this)[0], (*this)[1]);
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef bool scalar_t;
+  typedef __vector __bool long long bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values are -1, false values are 0
+  // truth values are interpreted bit-wise
+  static uint_t from_bool(bool a) { return -int_t(a); }
+  static bool to_bool(uint_t a) { return a; }
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v;
+
+  boolvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a)
+      : v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
+  boolvec(bool const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const {
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec &set_elt(int n, bool a) {
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec operator!() const { return vec_nor(v, v); }
+
+  boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+  boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+  boolvec operator==(boolvec x) const { return !(*this != x); }
+  boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+  bool all() const { return vec_all_ne(v, BV(false)); }
+  bool any() const { return vec_any_ne(v, BV(false)); }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef int_t scalar_t;
+  typedef __vector signed long long ivector_t;
+  static int const alignment = sizeof(ivector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  ivector_t v;
+
+  intvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // intvec(intvec const& x): v(x.v) {}
+  // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+  intvec(ivector_t x) : v(x) {}
+  intvec(int_t a) : v(vec_splats((long long)a)) {}
+  intvec(int_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+  static intvec iota() { return (__vector signed long long){0, 1}; }
+
+  operator ivector_t() const { return v; }
+  int_t operator[](int n) const {
+    return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+  }
+  intvec_t &set_elt(int n, int_t a) {
+    return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+  }
+
+  // Vector casts do not change the bit pattern
+  boolvec_t as_bool() const { return (__vector __bool long long)v; }
+  boolvec_t convert_bool() const { return *this != IV(I(0)); }
+  realvec_t as_float() const;      // defined after realvec
+  realvec_t convert_float() const; // defined after realvec
+
+  // Permutation control words
+private:
+  // 0123 4567 -> 1436
+  // exchange pairs
+  static __vector unsigned char perm_int_swap() {
+    return (__vector unsigned char){4,  5,  6,  7,  16, 17, 18, 19,
+                                    12, 13, 14, 15, 24, 25, 26, 27};
+  }
+  // 0123 4567 -> 0426
+  // broadcast high elements of pairs
+  static __vector unsigned char perm_int_bchi() {
+    return (__vector unsigned char){0, 1, 2,  3,  16, 17, 18, 19,
+                                    8, 9, 10, 11, 24, 25, 26, 27};
+  }
+
+public:
+  intvec operator+() const { return *this; }
+  intvec operator-() const { return vec_neg(v); }
+
+  intvec operator+(intvec x) const { return vec_add(v, x.v); }
+  intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+  intvec operator*(intvec x) const { return vec_mul(v, x.v); }
+  intvec operator/(intvec x) const { return vec_div(v, x.v); }
+  intvec operator%(intvec x) const { return *this - *this / x * x; }
+
+  intvec &operator+=(intvec const &x) { return *this = *this + x; }
+  intvec &operator-=(intvec const &x) { return *this = *this - x; }
+  intvec &operator*=(intvec const &x) { return *this = *this * x; }
+  intvec &operator/=(intvec const &x) { return *this = *this / x; }
+  intvec &operator%=(intvec const &x) { return *this = *this % x; }
+
+  intvec operator~() const {
+    return (__vector signed long long)vec_nor((__vector signed int)v,
+                                              (__vector signed int)v);
+  }
+
+  intvec operator&(intvec x) const {
+    return (__vector signed long long)vec_and((__vector signed int)v,
+                                              (__vector signed int)x.v);
+  }
+  intvec operator|(intvec x) const {
+    return (__vector signed long long)vec_or((__vector signed int)v,
+                                             (__vector signed int)x.v);
+  }
+  intvec operator^(intvec x) const {
+    return (__vector signed long long)vec_xor((__vector signed int)v,
+                                              (__vector signed int)x.v);
+  }
+
+  intvec &operator&=(intvec const &x) { return *this = *this & x; }
+  intvec &operator|=(intvec const &x) { return *this = *this | x; }
+  intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+  intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+  intvec lsr(int_t n) const { return lsr(IV(n)); }
+  intvec_t rotate(int_t n) const;
+  intvec operator>>(int_t n) const { return *this >> IV(n); }
+  intvec operator<<(int_t n) const { return *this << IV(n); }
+  intvec &operator>>=(int_t n) { return *this = *this >> n; }
+  intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+  intvec lsr(intvec n) const {
+    // return vec_sr(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, U((*this)[i]) >> U(n[i]));
     }
-    real_t minval() const
-    {
-      return vml_std::fmin((*this)[0], (*this)[1]);
+    return r;
+  }
+  intvec_t rotate(intvec_t n) const;
+  intvec operator>>(intvec n) const {
+    // return vec_sra(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] >> n[i]);
+    }
+    return r;
+  }
+  intvec operator<<(intvec n) const {
+    // return vec_sl(v, (__vector unsigned long long)n.v);
+    intvec r;
+    for (int i = 0; i < size; ++i) {
+      r.set_elt(i, (*this)[i] << n[i]);
     }
-    real_t prod() const
-    {
-      return (*this)[0] * (*this)[1];
+    return r;
+  }
+  intvec &operator>>=(intvec n) { return *this = *this >> n; }
+  intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+  intvec_t clz() const;
+  intvec_t popcount() const;
+
+  boolvec_t operator==(intvec const &x) const {
+    // return vec_cmpeq(v, x.v);
+    __vector signed int a = (__vector signed int)v;
+    __vector signed int b = (__vector signed int)x.v;
+    __vector __bool int c = vec_cmpeq(a, b);
+    __vector __bool int cx = vec_perm(c, c, perm_int_swap());
+    __vector __bool int r = vec_and(c, cx);
+    return (__vector __bool long long)r;
+  }
+  boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(intvec const &x) const {
+    __vector signed int a = (__vector signed int)v;
+    __vector signed int b = (__vector signed int)x.v;
+    __vector __bool int lt = vec_cmplt(a, b);
+    __vector __bool int eq = vec_cmpeq(a, b);
+    __vector unsigned int ua = (__vector unsigned int)v;
+    __vector unsigned int ub = (__vector unsigned int)x.v;
+    __vector __bool int ult = vec_cmplt(ua, ub);
+    __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
+    __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
+    r = vec_perm(r, r, perm_int_bchi());
+    return (__vector __bool long long)r;
+  }
+  boolvec_t operator<=(intvec const &x) const { return !(*this > x); }
+  boolvec_t operator>(intvec const &x) const { return x < *this; }
+  boolvec_t operator>=(intvec const &x) const { return !(*this < x); }
+
+  intvec_t abs() const;
+  boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); }
+  intvec_t max(intvec_t x) const;
+  intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+  static int const size = 2;
+  typedef real_t scalar_t;
+  typedef __vector double vector_t;
+  static int const alignment = sizeof(vector_t);
+
+  static char const *name() { return "<VSX:2*double>"; }
+  void barrier() { __asm__("" : "+v"(v)); }
+
+  static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                "vector size is wrong");
+
+  typedef boolvec<real_t, size> boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  vector_t v;
+
+  realvec() {}
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // realvec(realvec const& x): v(x.v) {}
+  // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+  realvec(vector_t x) : v(x) {}
+  realvec(real_t a) : v(vec_splats(a)) {}
+  realvec(real_t const *as) {
+    for (int d = 0; d < size; ++d)
+      set_elt(d, as[d]);
+  }
+
+  operator vector_t() const { return v; }
+  real_t operator[](int n) const {
+    return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+  }
+  realvec_t &set_elt(int n, real_t a) {
+    return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+  }
+
+  typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+  static realvec_t loada(real_t const *p) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    return vec_xld2(0, (real_t *)p);
+  }
+  static realvec_t loadu(real_t const *p) {
+    // TODO: Verify that vec_xld2 can handle an unaligned address
+    return vec_xld2(0, (real_t *)p);
+  }
+  static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff);
+    return loadu(p + ioff);
+  }
+  realvec_t loada(real_t const *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(all(m.m), true)) {
+      return loada(p);
+    } else {
+      return m.m.ifthen(loada(p), *this);
     }
-    real_t sum() const
-    {
-      return (*this)[0] + (*this)[1];
+  }
+  realvec_t loadu(real_t const *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      return loadu(p);
+    } else {
+      return m.m.ifthen(loadu(p), *this);
     }
-    
-    
-    
-    boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
-    boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
-    boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
-    boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
-    boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
-    boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-    
-    
-    
-    realvec acos() const { return MF::vml_acos(*this); }
-    realvec acosh() const { return MF::vml_acosh(*this); }
-    realvec asin() const { return MF::vml_asin(*this); }
-    realvec asinh() const { return MF::vml_asinh(*this); }
-    realvec atan() const { return MF::vml_atan(*this); }
-    realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
-    realvec atanh() const { return MF::vml_atanh(*this); }
-    realvec cbrt() const { return MF::vml_cbrt(*this); }
-    realvec ceil() const { return vec_ceil(v); }
-    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
-    realvec cos() const { return MF::vml_cos(*this); }
-    realvec cosh() const { return MF::vml_cosh(*this); }
-    realvec exp() const { return MF::vml_exp(*this); }
-    realvec exp10() const { return MF::vml_exp10(*this); }
-    realvec exp2() const { return MF::vml_exp2(*this); }
-    realvec expm1() const { return MF::vml_expm1(*this); }
-    realvec fabs() const { return vec_abs(v); }
-    realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
-    realvec floor() const { return vec_floor(v); }
-    realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
-    realvec fmax(realvec y) const { return vec_max(v, y.v); }
-    realvec fmin(realvec y) const { return vec_min(v, y.v); }
-    realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
-    realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
-    realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
-    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
-    boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
-    boolvec_t isinf() const { return MF::vml_isinf(*this); }
-    boolvec_t isnan() const { return MF::vml_isnan(*this); }
-    boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
-    realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
-    realvec log() const { return MF::vml_log(*this); }
-    realvec log10() const { return MF::vml_log10(*this); }
-    realvec log1p() const { return MF::vml_log1p(*this); }
-    realvec log2() const { return MF::vml_log2(*this); }
-    realvec_t mad(realvec_t y, realvec_t z) const
-    {
-      return MF::vml_mad(*this, y, z);
+  }
+  realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return loada(p + ioff, m);
+    return loadu(p + ioff, m);
+  }
+
+  void storea(real_t *p) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    vec_xstd2(v, 0, p);
+  }
+  void storeu(real_t *p) const {
+    // Vector stores would require vector loads, which would need to
+    // be atomic; the elements are therefore stored one at a time
+    // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+    // for good ideas
+    p[0] = (*this)[0];
+    p[1] = (*this)[1];
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff);
+    storeu(p + ioff);
+  }
+  void storea(real_t *p, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (__builtin_expect(m.all_m, true)) {
+      storea(p);
+    } else {
+      // TODO: Could vec_ste be used for these element stores?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
-    realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
-    realvec rcp() const
-    {
-      realvec x = *this;
-      realvec r = vec_re(v);    // this is only an approximation
-      // TODO: use fma
-      // Note: don't rewrite this expression, this may introduce
-      // cancellation errors
-      r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp)
-      r += r * (RV(1.0) - x*r);
-      return r;
+  }
+  void storeu(real_t *p, mask_t const &m) const {
+    if (__builtin_expect(m.all_m, true)) {
+      storeu(p);
+    } else {
+      // TODO: Could vec_ste be used for these element stores?
+      if (m.m[0])
+        p[0] = (*this)[0];
+      if (m.m[1])
+        p[1] = (*this)[1];
     }
-    realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
-    realvec rint() const { return vec_round(v); /* sic! */}
-    realvec round() const { return MF::vml_round(*this); }
-    realvec rsqrt() const { return RV(1.0) / sqrt(); }
-    boolvec_t signbit() const { return MF::vml_signbit(*this); }
-    realvec sin() const { return MF::vml_sin(*this); }
-    realvec sinh() const { return MF::vml_sinh(*this); }
-    realvec sqrt() const { return vec_sqrt(v); }
-    realvec tan() const { return MF::vml_tan(*this); }
-    realvec tanh() const { return MF::vml_tanh(*this); }
-    realvec trunc() const { return vec_trunc(v); }
-  };
-  
-  
-  
-  // boolvec definitions
-  
-  inline intvec<double,2> boolvec<double,2>::as_int() const
-  {
-    return (__vector signed long long) v;
-  }
-  
-  inline intvec<double,2> boolvec<double,2>::convert_int() const
-  {
-    return -(__vector signed long long)v;
-  }
-  
-  inline
-  boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  inline
-  realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
-  {
-    return vec_sel(y.v, x.v, v);
-  }
-  
-  
-  
-  // intvec definitions
-  
-  inline intvec<double,2> intvec<double,2>::abs() const
-  {
-    return MF::vml_abs(*this);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::as_float() const
-  {
-    return (__vector double)v;
-  }
-  
-  inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
-						      intvec_t y) const
-  {
-    return MF::vml_bitifthen(*this, x, y);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::clz() const
-  {
-    return MF::vml_clz(*this);
-  }
-  
-  inline realvec<double,2> intvec<double,2>::convert_float() const
-  {
-    // return vec_ctd(v, 0);
-    return MF::vml_convert_float(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
-  {
-    return MF::vml_max(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
-  {
-    return MF::vml_min(*this, x);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::popcount() const
-  {
-    return MF::vml_popcount(*this);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
-  inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
-  {
-    return MF::vml_rotate(*this, n);
-  }
-  
+  }
+  void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+    VML_ASSERT(intptr_t(p) % alignment == 0);
+    if (ioff % realvec::size == 0)
+      return storea(p + ioff, m);
+    storeu(p + ioff, m);
+  }
+
+  intvec_t as_int() const { return (__vector signed long long)v; }
+  intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+
+  realvec operator+() const { return *this; }
+  realvec operator-() const { return RV(0.0) - *this; }
+
+  realvec operator+(realvec x) const { return vec_add(v, x.v); }
+  realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+  realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+  realvec operator/(realvec x) const { return vec_div(v, x.v); }
+
+  realvec &operator+=(realvec const &x) { return *this = *this + x; }
+  realvec &operator-=(realvec const &x) { return *this = *this - x; }
+  realvec &operator*=(realvec const &x) { return *this = *this * x; }
+  realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+  real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); }
+  real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+  real_t prod() const { return (*this)[0] * (*this)[1]; }
+  real_t sum() const { return (*this)[0] + (*this)[1]; }
+
+  boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); }
+  boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+  boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+  boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); }
+  boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+  boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+  realvec acos() const { return MF::vml_acos(*this); }
+  realvec acosh() const { return MF::vml_acosh(*this); }
+  realvec asin() const { return MF::vml_asin(*this); }
+  realvec asinh() const { return MF::vml_asinh(*this); }
+  realvec atan() const { return MF::vml_atan(*this); }
+  realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+  realvec atanh() const { return MF::vml_atanh(*this); }
+  realvec cbrt() const { return MF::vml_cbrt(*this); }
+  realvec ceil() const { return vec_ceil(v); }
+  realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+  realvec cos() const { return MF::vml_cos(*this); }
+  realvec cosh() const { return MF::vml_cosh(*this); }
+  realvec exp() const { return MF::vml_exp(*this); }
+  realvec exp10() const { return MF::vml_exp10(*this); }
+  realvec exp2() const { return MF::vml_exp2(*this); }
+  realvec expm1() const { return MF::vml_expm1(*this); }
+  realvec fabs() const { return vec_abs(v); }
+  realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+  realvec floor() const { return vec_floor(v); }
+  realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+  realvec fmax(realvec y) const { return vec_max(v, y.v); }
+  realvec fmin(realvec y) const { return vec_min(v, y.v); }
+  realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+  realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+  realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+  intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+  boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+  boolvec_t isinf() const { return MF::vml_isinf(*this); }
+  boolvec_t isnan() const { return MF::vml_isnan(*this); }
+  boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+  realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+  realvec log() const { return MF::vml_log(*this); }
+  realvec log10() const { return MF::vml_log10(*this); }
+  realvec log1p() const { return MF::vml_log1p(*this); }
+  realvec log2() const { return MF::vml_log2(*this); }
+  realvec_t mad(realvec_t y, realvec_t z) const {
+    return MF::vml_mad(*this, y, z);
+  }
+  realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+  realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+  realvec rcp() const {
+    realvec x = *this;
+    realvec r = vec_re(v); // this is only an approximation
+    // TODO: use fma
+    // Note: don't rewrite this expression; rewriting it may introduce
+    // cancellation errors
+    r += r * (RV(1.0) - x * r); // two Newton iterations (see vml_rcp)
+    r += r * (RV(1.0) - x * r);
+    return r;
+  }
+  realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+  realvec rint() const { return vec_round(v); /* sic! */ }
+  realvec round() const { return MF::vml_round(*this); }
+  realvec rsqrt() const { return RV(1.0) / sqrt(); }
+  boolvec_t signbit() const { return MF::vml_signbit(*this); }
+  realvec sin() const { return MF::vml_sin(*this); }
+  realvec sinh() const { return MF::vml_sinh(*this); }
+  realvec sqrt() const { return vec_sqrt(v); }
+  realvec tan() const { return MF::vml_tan(*this); }
+  realvec tanh() const { return MF::vml_tanh(*this); }
+  realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const {
+  return (__vector signed long long)v;
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const {
+  return -(__vector signed long long)v;
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x,
+                                                     realvec_t y) const {
+  return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+inline intvec<double, 2> intvec<double, 2>::abs() const {
+  return MF::vml_abs(*this);
+}
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const {
+  return (__vector double)v;
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x,
+                                                      intvec_t y) const {
+  return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const {
+  return MF::vml_clz(*this);
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const {
+  // return vec_ctd(v, 0);
+  return MF::vml_convert_float(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const {
+  return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const {
+  return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const {
+  return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const {
+  return MF::vml_rotate(*this, n);
+}
+
 } // namespace vecmathlib
 
-#endif  // #ifndef VEC_VSX_DOUBLE2_H
+#endif // #ifndef VEC_VSX_DOUBLE2_H
diff --git a/vecmathlib.h b/vecmathlib.h
index 9accd24..0d72add 100644
--- a/vecmathlib.h
+++ b/vecmathlib.h
@@ -4,16 +4,14 @@
 #define VECMATHLIB_H
 
 #if defined VML_DEBUG || defined VML_NODEBUG
-#  if defined VML_DEBUG && defined VML_NODEBUG
-#    error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
-#  endif
+#if defined VML_DEBUG && defined VML_NODEBUG
+#error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
+#endif
 #else
 // default
-#  define VML_DEBUG
+#define VML_DEBUG
 #endif
 
-
-
 // FP settings
 
 // Possible effects of not having VML_HAVE_FP_CONTRACT:
@@ -23,7 +21,7 @@
 // - can evaluate functions with reduced precision (80% of significant digits)
 
 // default settings
-#undef VML_HAVE_DENORMALS       // TODO
+#undef VML_HAVE_DENORMALS // TODO
 #define VML_HAVE_FP_CONTRACT
 #define VML_HAVE_INF
 #define VML_HAVE_NAN
@@ -31,63 +29,59 @@
 
 // optimized settings
 #ifdef __FAST_MATH__
-#  undef VML_HAVE_DENORMALS
-#  undef VML_HAVE_FP_CONTRACT
-#  undef VML_HAVE_INF
-#  undef VML_HAVE_NAN
+#undef VML_HAVE_DENORMALS
+#undef VML_HAVE_FP_CONTRACT
+#undef VML_HAVE_INF
+#undef VML_HAVE_NAN
 #endif
 
 #ifdef VML_DEBUG
-#  define VML_CONFIG_DEBUG " debug"
+#define VML_CONFIG_DEBUG " debug"
 #else
-#  define VML_CONFIG_DEBUG " no-debug"
+#define VML_CONFIG_DEBUG " no-debug"
 #endif
 #ifdef VML_DENORMALS
-#  define VML_CONFIG_DENORMALS " denormals"
+#define VML_CONFIG_DENORMALS " denormals"
 #else
-#  define VML_CONFIG_DENORMALS " no-denormals"
+#define VML_CONFIG_DENORMALS " no-denormals"
 #endif
 #ifdef VML_FP_CONTRACT
-#  define VML_CONFIG_FP_CONTRACT " fp-contract"
+#define VML_CONFIG_FP_CONTRACT " fp-contract"
 #else
-#  define VML_CONFIG_FP_CONTRACT " no-fp-contract"
+#define VML_CONFIG_FP_CONTRACT " no-fp-contract"
 #endif
 #ifdef VML_INF
-#  define VML_CONFIG_INF " inf"
+#define VML_CONFIG_INF " inf"
 #else
-#  define VML_CONFIG_INF " no-inf"
+#define VML_CONFIG_INF " no-inf"
 #endif
 #ifdef VML_NAN
-#  define VML_CONFIG_NAN " nan"
+#define VML_CONFIG_NAN " nan"
 #else
-#  define VML_CONFIG_NAN " no-nan"
+#define VML_CONFIG_NAN " no-nan"
 #endif
 
 // TODO: introduce mad, as fast version of fma (check FP_FAST_FMA)
 // TODO: introduce ieee_isnan and friends
 // TODO: switch between isnan and ieee_isnan at an outside level
 
-
-
 // This workaround is needed for older libstdc++ versions such as the
 // one in Debian 6.0 when compiled with clang++
 // <http://lists.cs.uiuc.edu/pipermail/cfe-dev/2011-February/013207.html>.
 // The version time stamp used below is the one in Debian 6.0.
-#include <cstring>              // pull in __GLIBCXX__
+#include <cstring> // pull in __GLIBCXX__
 #if defined __GLIBCXX__ && __GLIBCXX__ <= 20101114
-namespace std { class type_info; }
+namespace std {
+class type_info;
+}
 #endif
 
-
-
 #include <cassert>
 
-
-
 #ifdef VML_DEBUG
-#  define VML_ASSERT(x) assert(x)
+#define VML_ASSERT(x) assert(x)
 #else
-#  define VML_ASSERT(x) ((void)0)
+#define VML_ASSERT(x) ((void)0)
 #endif
 
 // Scalarise all vector operations, and use libm's functions (mostly
@@ -96,146 +90,142 @@ namespace std { class type_info; }
 
 #ifdef __clang__
 // Use compiler-provided vector types
-#  include "vec_builtin.h"
+#include "vec_builtin.h"
 #endif
 
 // Scalarise all vector operations; don't use libm, use only
 // Vecmathlib's functions (mostly useful for testing Vecmathlib)
 #include "vec_test.h"
 
-#if defined __ARM_NEON__        // ARM NEON
-#  include "vec_neon_float2.h"
-#  include "vec_neon_float4.h"
-#  define VML_CONFIG_NEON " NEON"
-#else
-#  define VML_CONFIG_NEON
-#endif
-
-#if defined __SSE2__            // Intel SSE 2
-#  include "vec_sse_float1.h"
-#  include "vec_sse_float4.h"
-#  include "vec_sse_double1.h"
-#  include "vec_sse_double2.h"
-#  if defined __SSE3__
-#    define VML_CONFIG_SSE3 " SSE3"
-#  else
-#    define VML_CONFIG_SSE3
-#  endif
-#  if defined __SSSE3__
-#    define VML_CONFIG_SSSE3 " SSSE3"
-#  else
-#    define VML_CONFIG_SSSE3
-#  endif
-#  if defined __SSE4_1__
-#    define VML_CONFIG_SSE4_1 " SSE4.1"
-#  else
-#    define VML_CONFIG_SSE4_1
-#  endif
-#  if defined __SSE4a__
-#    define VML_CONFIG_SSE4a " SSE4a"
-#  else
-#    define VML_CONFIG_SSE4a
-#  endif
-#  define VML_CONFIG_SSE2 " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
-#else
-#  define VML_CONFIG_SSE2
-#endif
-
-#if defined __AVX__             // Intel AVX
-#  include "vec_avx_fp8_32.h"
-#  include "vec_avx_fp16_16.h"
-#  include "vec_avx_float8.h"
-#  include "vec_avx_double4.h"
-#  define VML_CONFIG_AVX " AVX"
-#else
-#  define VML_CONFIG_AVX
-#endif
-
-#if defined __MIC__             // Intel MIC
+#if defined __ARM_NEON__ // ARM NEON
+#include "vec_neon_float2.h"
+#include "vec_neon_float4.h"
+#define VML_CONFIG_NEON " NEON"
+#else
+#define VML_CONFIG_NEON
+#endif
+
+#if defined __SSE2__ // Intel SSE 2
+#include "vec_sse_float1.h"
+#include "vec_sse_float4.h"
+#include "vec_sse_double1.h"
+#include "vec_sse_double2.h"
+#if defined __SSE3__
+#define VML_CONFIG_SSE3 " SSE3"
+#else
+#define VML_CONFIG_SSE3
+#endif
+#if defined __SSSE3__
+#define VML_CONFIG_SSSE3 " SSSE3"
+#else
+#define VML_CONFIG_SSSE3
+#endif
+#if defined __SSE4_1__
+#define VML_CONFIG_SSE4_1 " SSE4.1"
+#else
+#define VML_CONFIG_SSE4_1
+#endif
+#if defined __SSE4a__
+#define VML_CONFIG_SSE4a " SSE4a"
+#else
+#define VML_CONFIG_SSE4a
+#endif
+#define VML_CONFIG_SSE2                                                        \
+  " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
+#else
+#define VML_CONFIG_SSE2
+#endif
+
+#if defined __AVX__ // Intel AVX
+#include "vec_avx_fp8_32.h"
+#include "vec_avx_fp16_16.h"
+#include "vec_avx_float8.h"
+#include "vec_avx_double4.h"
+#define VML_CONFIG_AVX " AVX"
+#else
+#define VML_CONFIG_AVX
+#endif
+
+#if defined __MIC__ // Intel MIC
 // TODO: single precision?
-#  include "vec_mic_double8.h"
-#  define VML_CONFIG_MIC " MIC"
+#include "vec_mic_double8.h"
+#define VML_CONFIG_MIC " MIC"
 #else
-#  define VML_CONFIG_MIC
+#define VML_CONFIG_MIC
 #endif
 
-#if defined __ALTIVEC__         // IBM Altivec
-#  include "vec_altivec_float4.h"
-#  define VML_CONFIG_ALTIVEC " Altivec"
+#if defined __ALTIVEC__ // IBM Altivec
+#include "vec_altivec_float4.h"
+#define VML_CONFIG_ALTIVEC " Altivec"
 #else
-#  define VML_CONFIG_ALTIVEC
+#define VML_CONFIG_ALTIVEC
 #endif
 #if defined __ALTIVEC__ && defined _ARCH_PWR7 // IBM VSX
-#  include "vec_vsx_double2.h"
-#  define VML_CONFIG_VSX " VSX"
+#include "vec_vsx_double2.h"
+#define VML_CONFIG_VSX " VSX"
 #else
-#  define VML_CONFIG_VSX
+#define VML_CONFIG_VSX
 #endif
 
 // TODO: IBM Blue Gene/P DoubleHummer
 
 #if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX
 // TODO: vec_qpx_float4
-#  include "vec_qpx_double4.h"
-#  define VML_CONFIG_QPX " QPX"
+#include "vec_qpx_double4.h"
+#define VML_CONFIG_QPX " QPX"
 #else
-#  define VML_CONFIG_QPX
+#define VML_CONFIG_QPX
 #endif
 
-#define VECMATHLIB_CONFIGURATION                                        \
-  "VecmathlibConfiguration"                                             \
-  VML_CONFIG_DEBUG                                                      \
-  VML_CONFIG_DENORMALS VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN \
-  VML_CONFIG_NEON                                                       \
-  VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC                         \
-  VML_CONFIG_ALTIVEC VML_CONFIG_VSX                                     \
-  VML_CONFIG_QPX
-
-
+#define VECMATHLIB_CONFIGURATION                                               \
+  "VecmathlibConfiguration" VML_CONFIG_DEBUG VML_CONFIG_DENORMALS              \
+      VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN VML_CONFIG_NEON     \
+          VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC VML_CONFIG_ALTIVEC     \
+              VML_CONFIG_VSX VML_CONFIG_QPX
 
 // Define "best" vector types
 namespace vecmathlib {
-  
+
 #if defined VECMATHLIB_HAVE_VEC_FLOAT_16
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 16
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 16
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 8
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 8
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 4
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 4
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 2
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 2
 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_1
-#  define VECMATHLIB_MAX_FLOAT_VECSIZE 1
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 1
 #endif
-  
+
 #if defined VECMATHLIB_HAVE_VEC_DOUBLE_8
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 8
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 8
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_1
-#  define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
 #endif
-  
+
 #ifdef VECMATHLIB_MAX_FLOAT_VECSIZE
-  typedef realvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec;
-  typedef intvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE>  int32_vec;
-  typedef boolvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
+typedef realvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec;
+typedef intvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec;
+typedef boolvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
 #else
-  typedef realpseudovec<float,1> float32_vec;
-  typedef intpseudovec<float,1>  int32_vec;
-  typedef boolpseudovec<float,1> bool32_vec;
+typedef realpseudovec<float, 1> float32_vec;
+typedef intpseudovec<float, 1> int32_vec;
+typedef boolpseudovec<float, 1> bool32_vec;
 #endif
-  
+
 #ifdef VECMATHLIB_MAX_DOUBLE_VECSIZE
-  typedef realvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec;
-  typedef intvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE>  int64_vec;
-  typedef boolvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
+typedef realvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec;
+typedef intvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec;
+typedef boolvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
 #else
-  typedef realpseudovec<double,1> float64_vec;
-  typedef intpseudovec<double,1>  int64_vec;
-  typedef boolpseudovec<double,1> bool64_vec;
+typedef realpseudovec<double, 1> float64_vec;
+typedef intpseudovec<double, 1> int64_vec;
+typedef boolpseudovec<double, 1> bool64_vec;
 #endif
 }