Import initial version

author: Erik Schnetter <schnetter@gmail.com> 2012-11-30 15:50:03 -0500
committer: Erik Schnetter <schnetter@gmail.com> 2012-11-30 15:50:03 -0500
commit: d2614759a1d542c41af59b04d8711246d2a1e876 (patch)
tree: 2689209034d9ccc3d29208d50b82b4f89116aa1b
download: vecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.zip
vecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.tar.gz
17 files changed, 1895 insertions, 0 deletions
diff --git a/IDEAS b/IDEAS
new file mode 100644
index 0000000..6a2f385
--- /dev/null
+++ b/IDEAS
@@ -0,0 +1,7 @@
+implement cosh, sinh via expansion, similar to asin
+   https://en.wikipedia.org/wiki/Hyperbolic_function
+
+implement exp via cosh, sinh
+
+implement acosh, asinh via expansion, similar to asin
+   https://en.wikipedia.org/wiki/Inverse_hyperbolic_function
diff --git a/build.ninja b/build.ninja
new file mode 100644
index 0000000..fbfd02b
--- /dev/null
+++ b/build.ninja
@@ -0,0 +1,33 @@
+ar  = ar
+cxx = g++
+
+cppflags =
+cxxflags = -std=gnu++11 -Wall -g -march=native
+# -Ofast
+ldflags = -L.
+
+rule cxx
+  command = $cxx $cppflags -MMD -MT $out -MF $out.d $cxxflags -c $in -o $out
+  description = CXX $out
+  depfile = $out.d
+
+rule ar
+  command = rm -f $out && $ar crs $out $in && ranlib $out
+  description = AR $out
+
+rule link
+  command = $cxx $cppflags $cxxflags $ldflags -o $out $in $libs
+  description = LINK $out
+
+build empty.o: cxx empty.cc
+build example.o: cxx example.cc
+build test.o: cxx test.cc
+
+build libvecmath.a: ar empty.o
+
+build example: link example.o | libvecmath.a
+  libs = -lvecmath
+build test: link test.o | libvecmath.a
+  libs = -lvecmath
+
+default example test
diff --git a/empty.cc b/empty.cc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/empty.cc
diff --git a/example.cc b/example.cc
new file mode 100644
index 0000000..50415c2
--- /dev/null
+++ b/example.cc
@@ -0,0 +1,32 @@
+// -*-C++-*-
+
+#include "vec_float.h"
+#include "vec_double_avx.h"
+
+#include <iostream>
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+  using namespace vecmathlib;
+  typedef realvec<float,1> realvec_t;
+  // typedef realvec<double,4> realvec_t;
+  typedef realvec_t::boolvec_t boolvec_t;
+  typedef realvec_t::intvec_t intvec_t;
+  
+  realvec_t x = 1.0;
+  realvec_t y = x + realvec_t(1.0);
+  y = sqrt(y);
+  realvec_t z = log(y);
+  boolvec_t b = x < y;
+  intvec_t i = convert_int(y);
+  
+  cout << "x=" << x << "\n";
+  cout << "y=" << y << "\n";
+  cout << "z=" << z << "\n";
+  cout << "b=" << b << "\n";
+  cout << "i=" << i << "\n";
+  
+  return 0;
+}
diff --git a/floatprops.h b/floatprops.h
new file mode 100644
index 0000000..7048112
--- /dev/null
+++ b/floatprops.h
@@ -0,0 +1,152 @@
+// -*-C++-*-
+
+#ifndef FLOATPROPS_H
+#define FLOATPROPS_H
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+
+
+namespace vecmathlib {
+  
+  // A structure describing various properties of a floating point
+  // type. Most properties are already described in numeric_limits, so
+  // we inherit it.
+  template<typename real_t>
+  struct floatprops {
+    // Some interesting properties are:
+    //    digits
+    //    min_exponent
+    //    max_exponent
+  };
+  
+  template<typename int_t>
+  struct intprops {
+  };
+  
+  
+  
+  // Properties of float
+  template<>
+  struct floatprops<float>: std::numeric_limits<float> {
+    typedef float real_t;
+    typedef int32_t int_t;
+    typedef uint32_t uint_t;
+    
+    // Ensure the internal representation is what we expect
+    static_assert(is_signed, "real_t is not signed");
+    static_assert(radix==2, "real_t is not binary");
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);
+    static int const mantissa_bits = digits - 1;
+    static int const sign_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - sign_bits;
+    static int const exponent_offset = 2 - min_exponent;
+    static_assert(mantissa_bits + exponent_bits + sign_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+    static uint_t const sign_mask = uint_t(1) << (bits-1);
+    static_assert((mantissa_mask & exponent_mask & sign_mask) == uint_t(0),
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | sign_mask) == ~uint_t(0),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static inline real_t as_float(int_t x) { return *(real_t*)&x; }
+    static inline int_t as_int(real_t x) { return *(int_t*)&x; }
+    
+    // Convert values
+    static inline real_t convert_float(int_t x) { return real_t(x); }
+    static inline int_t convert_int(real_t x)
+    {
+      static_assert(sizeof(std::rint(x)) == sizeof(int_t),
+                    "rint() has wrong return type");
+      return std::rint(x);
+    }
+  };
+  
+  template<>
+  struct intprops<floatprops<float>::int_t> {
+    typedef float real_t;
+  };
+  
+  
+  
+  // Properties of double
+  template<>
+  struct floatprops<double>: std::numeric_limits<double> {
+    typedef double real_t;
+    typedef int64_t int_t;
+    typedef uint64_t uint_t;
+    
+    // Ensure the internal representation is what we expect
+    static_assert(is_signed, "real_t is not signed");
+    static_assert(radix==2, "real_t is not binary");
+    
+    // Ensure the sizes match
+    static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+    static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+    
+    // Number of bits in internal representation
+    static int const bits = 8 * sizeof(real_t);
+    static int const mantissa_bits = digits - 1;
+    static int const sign_bits = 1;
+    static int const exponent_bits = bits - mantissa_bits - sign_bits;
+    static int const exponent_offset = 2 - min_exponent;
+    static_assert(mantissa_bits + exponent_bits + sign_bits == bits,
+                  "error in bit counts");
+    static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+    static uint_t const exponent_mask =
+      ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+    static uint_t const sign_mask = uint_t(1) << (bits-1);
+    static_assert((mantissa_mask & exponent_mask & sign_mask) == uint_t(0),
+                  "error in masks");
+    static_assert((mantissa_mask | exponent_mask | sign_mask) == ~uint_t(0),
+                  "error in masks");
+    
+    // Re-interpret bit patterns
+    static inline real_t as_float(int_t x) { return *(real_t*)&x; }
+    static inline int_t as_int(real_t x) { return *(int_t*)&x; }
+    
+    // Convert values
+    static inline real_t convert_float(int_t x) { return real_t(x); }
+    static inline int_t convert_int(real_t x)
+    {
+      static_assert(sizeof(std::llrint(x)) == sizeof(int_t),
+                    "llrint() has wrong return type");
+      return std::llrint(x);
+    }
+  };
+  
+  template<>
+  struct intprops<floatprops<double>::int_t> {
+    typedef double real_t;
+  };
+  
+  
+  
+  template<typename int_t>
+  inline typename intprops<int_t>::real_t as_float(int_t x)
+  {
+    typedef typename intprops<int_t>::real_t real_t;
+    return floatprops<real_t>::as_float(x);
+  }
+  
+  template<typename real_t>
+  inline typename floatprops<real_t>::int_t as_int(real_t x)
+  {
+    return floatprops<real_t>::as_int(x);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef FLOATPROPS_H
diff --git a/mathfuncs.h b/mathfuncs.h
new file mode 100644
index 0000000..e62358b
--- /dev/null
+++ b/mathfuncs.h
@@ -0,0 +1,15 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_H
+#define MATHFUNCS_H
+
+#include "mathfuncs_base.h"
+
+#include "mathfuncs_asin.h"
+#include "mathfuncs_convert.h"
+#include "mathfuncs_fabs.h"
+#include "mathfuncs_log.h"
+#include "mathfuncs_rcp.h"
+#include "mathfuncs_sqrt.h"
+
+#endif  // #ifndef MATHFUNCS_H
diff --git a/mathfuncs_asin.h b/mathfuncs_asin.h
new file mode 100644
index 0000000..a8ffff4
--- /dev/null
+++ b/mathfuncs_asin.h
@@ -0,0 +1,83 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_ASIN_H
+#define MATHFUNCS_ASIN_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t x)
+  {
+    // Handle negative values
+    realvec_t x0 = x;
+    x = fabs(x);
+    
+    // Reduce range using 1/x identity
+    assert(all(x >= RV(0.0)));
+    boolvec_t gt_one = x > RV(1.0);
+    x = ifthen(gt_one, rcp(x), x);
+    
+    // Reduce range again using half-angle formula; see
+    // <https://en.wikipedia.org/wiki/Inverse_trigonometric_functions>.
+    // This is necessary for good convergence below.
+    x = x / (RV(1.0) + sqrt(RV(1.0) + x*x));
+    
+    // Taylor expansion; see
+    // <https://en.wikipedia.org/wiki/Inverse_trigonometric_functions>.
+    assert(all(x >= RV(0.0) && x <= RV(0.5)));
+    int const nmax = 30;        // ???
+    realvec_t y = x / (RV(1.0) + x*x);
+    realvec_t x2 = x * y;
+    realvec_t r = y;
+    for (int n=3; n<nmax; n+=2) {
+      y *= RV(R(n-1) / R(n)) * x2;
+      r += y;
+    }
+    
+    // Undo second range reduction
+    r = RV(2.0) * r;
+    
+    // Undo range reduction
+    r = ifthen(gt_one, RV(M_PI_2) - r, r);
+    
+    // Handle negative values
+    r = copysign(r, x0);
+    
+    return r;
+  }
+  
+  
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t x)
+  {
+    // Handle negative values
+    boolvec_t is_negative = signbit(x);
+    x = fabs(x);
+    
+    realvec_t r = RV(2.0) * atan(sqrt(RV(1.0) - x*x) / (RV(1.0) + x));
+    
+    // Handle negative values
+    r = ifthen(is_negative, RV(M_PI) - r, r);
+    
+    return r;
+  }
+  
+  
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t x)
+  {
+    return RV(2.0) * atan(x / (RV(1.0) + sqrt(RV(1.0) - x*x)));
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_ASIN_H
diff --git a/mathfuncs_base.h b/mathfuncs_base.h
new file mode 100644
index 0000000..0b9f04f
--- /dev/null
+++ b/mathfuncs_base.h
@@ -0,0 +1,67 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_BASE_H
+#define MATHFUNCS_BASE_H
+
+#include "floatprops.h"
+#include "vec_base.h"
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  struct mathfuncs {
+    typedef floatprops<typename realvec_t::real_t> FP;
+    
+    typedef typename FP::real_t real_t;
+    typedef typename FP::int_t int_t;
+    typedef typename FP::uint_t uint_t;
+    
+    static int const size = realvec_t::size;
+    
+    // typedef realvec<real_t, size> realvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef boolvec<real_t, size> boolvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    
+    // asin
+    static realvec_t vml_acos(realvec_t x);
+    static realvec_t vml_asin(realvec_t x);
+    static realvec_t vml_atan(realvec_t x);
+    
+    // convert
+    static realvec_t vml_convert_float(intvec_t x);
+    static intvec_t vml_convert_int(realvec_t x);
+    
+    // fabs
+    static realvec_t vml_copysign(realvec_t x, realvec_t y);
+    static realvec_t vml_fabs(realvec_t x);
+    static intvec_t vml_ilogb(realvec_t x);
+    static realvec_t vml_scalbn(realvec_t x, intvec_t n);
+    static boolvec_t vml_signbit(realvec_t x);
+    
+    // log
+    static realvec_t vml_log(realvec_t x);
+    static realvec_t vml_log10(realvec_t x);
+    static realvec_t vml_log1p(realvec_t x);
+    static realvec_t vml_log2(realvec_t x);
+    
+    // rcp
+    static realvec_t vml_rcp(realvec_t x);
+    
+    // sqrt
+    static realvec_t vml_rsqrt(realvec_t x);
+    static realvec_t vml_sqrt(realvec_t x);
+  };
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_BASE_H
diff --git a/mathfuncs_convert.h b/mathfuncs_convert.h
new file mode 100644
index 0000000..4dfffef
--- /dev/null
+++ b/mathfuncs_convert.h
@@ -0,0 +1,86 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_CONVERT_H
+#define MATHFUNCS_CONVERT_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x)
+  {
+    // Convert in two passes. Convert as much as possible during the
+    // first pass (lobits), so that the second pass (hibits) may be
+    // omitted if the high bits are known to be zero.
+    int_t lobits = FP::mantissa_bits;
+    int_t hibits = FP::bits - lobits;
+    
+    // Convert lower bits
+    intvec_t xlo = x & IV((U(1) << lobits) - 1);
+    // exponent for the equivalent floating point number
+    int_t exponent_lo = FP::exponent_offset + lobits;
+    xlo |= exponent_lo;
+    // subtract hidden mantissa bit
+    realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
+    
+    // Convert upper bits
+    // make unsigned by subtracting largest negative number
+    x ^= FP::sign_mask;
+    intvec_t xhi = lsr(x, lobits);
+    // exponent for the equivalent floating point number
+    int_t exponent_hi = FP::exponent_offset + FP::bits;
+    xhi |= exponent_hi;
+    // add largest negative number, and subtract hidden mantissa bit
+    realvec_t
+      fhi = as_float(xhi) - RV(FP::as_float(exponent_hi) + R(FP::sign_mask));
+    
+    // Combine results
+    return flo + fhi;
+  }
+  
+  
+  
+  template<typename realvec_t>
+  auto mathfuncs<realvec_t>::vml_convert_int(realvec_t x) -> intvec_t
+  {
+    // Handle zero
+    boolvec_t is_zero = x == RV(0.0);
+    // Handle negative numbers
+    boolvec_t is_negative = signbit(x);
+    x = fabs(x);
+    
+    // Round, by adding a large number that removes the excess
+    // precision
+    int_t large = U(1) << FP::mantissa_bits;
+    x += R(large);
+    
+    intvec_t exponent = ilogb(x);
+    for (int i=0; i<intvec_t::size; ++i) {
+      assert(exponent[i] >= FP::mantissa_bits);
+    }
+    intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
+    // add hidden mantissa bit
+    ix |= U(1) << FP::mantissa_bits;
+    // shift according to exponent
+    ix <<= exponent - IV(FP::mantissa_bits);
+    
+    // Undo the adding above
+    ix -= large;
+    
+    // Handle negative numbers
+    ix = ifthen(is_negative, -ix, ix);
+    // Handle zero
+    ix = ifthen(is_zero, IV(0), ix);
+    
+    return ix;
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_CONVERT_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
new file mode 100644
index 0000000..034fedb
--- /dev/null
+++ b/mathfuncs_fabs.h
@@ -0,0 +1,52 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_FABS_H
+#define MATHFUNCS_FABS_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
+  {
+    intvec_t value = as_int(x) & IV(~FP::sign_mask);
+    intvec_t sign = as_int(y) & IV(FP::sign_mask);
+    return as_float(sign | value);
+  }
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
+  {
+    return as_float(as_int(x) & IV(~FP::sign_mask));
+  }
+  
+  template<typename realvec_t>
+  auto mathfuncs<realvec_t>::vml_ilogb(realvec_t x) -> intvec_t
+  {
+    return
+      lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits) -
+      IV(FP::exponent_offset);
+  }
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_scalbn(realvec_t x, intvec_t n)
+  {
+    return as_float(as_int(x) + (n << FP::mantissa_bits));
+    // return x * as_float((n + exponent_offset) << mantissa_bits);
+  }
+  
+  template<typename realvec_t>
+  auto mathfuncs<realvec_t>::vml_signbit(realvec_t x) -> boolvec_t
+  {
+    return convert_bool(as_int(x) & IV(FP::sign_mask));
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_FABS_H
diff --git a/mathfuncs_log.h b/mathfuncs_log.h
new file mode 100644
index 0000000..8fb3204
--- /dev/null
+++ b/mathfuncs_log.h
@@ -0,0 +1,70 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_LOG_H
+#define MATHFUNCS_LOG_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x)
+  {
+    // Rescale
+    assert(all(x > RV(0.0)));
+    intvec_t ilogb_x = ilogb(x);
+    x = scalbn(x, -ilogb_x);
+    assert(all(x >= RV(1.0) && x < RV(2.0)));
+    
+    // Approximate
+    assert(all(x >= RV(1.0) && x < RV(2.0)));
+    // log(x) = Sum[n=1,nmax,n%2==1] 2/n ((x-1) / (x+1))^n
+    int const nmax = 30;
+    x *= RV(M_SQRT1_2);         // shift range to increase accuracy
+    realvec_t xm1_xp1 = (x - RV(1.0)) / (x + RV(1.0));
+    realvec_t xm1_xp1_2 = xm1_xp1 * xm1_xp1;
+    realvec_t y = RV(M_LOG2E * 2.0) * xm1_xp1;
+    realvec_t r = y;
+    for (int n=3; n<nmax; n+=2) {
+      y *= xm1_xp1_2;
+      r += y * RV(R(1.0) / R(n));
+    }
+    r += RV(0.5);               // correct result for range shift
+    
+    // Undo rescaling
+    r += convert_float(ilogb_x);
+    
+    return r;
+  }
+  
+  
+  
+  template<typename realvec_t>
+  inline
+  realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x)
+  {
+    return log2(x) * RV(M_LN2);
+  }
+
+  template<typename realvec_t>
+  inline
+  realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x)
+  {
+    return log(x) * RV(M_LOG10E);
+  }
+
+  template<typename realvec_t>
+  inline
+  realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x)
+  {
+    return log(RV(1.0) + x);
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_LOG_H
diff --git a/mathfuncs_rcp.h b/mathfuncs_rcp.h
new file mode 100644
index 0000000..1704d8f
--- /dev/null
+++ b/mathfuncs_rcp.h
@@ -0,0 +1,48 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_RCP_H
+#define MATHFUNCS_RCP_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x)
+  {
+    // Handle negative values
+    realvec_t x0 = x;
+    x = fabs(x);
+    
+    // Initial guess
+    assert(all(x > RV(0.0)));
+    intvec_t ilogb_x = ilogb(x);
+    // For stability, choose a starting value that is below the result
+    realvec_t r = scalbn(RV(0.5), -ilogb_x);
+    
+    // Iterate
+    int const nmax = 7;
+    for (int n=1; n<nmax; ++n) {
+      // Step
+      assert(all(x > RV(0.0)));
+      // Newton method:
+      // Solve   f(r) = 0   for   f(r) = x - 1/r
+      //    r <- r - f(r) / f'(r)
+      //    r <- 2 r - r^2 x
+      r *= RV(2.0) - r * x;
+    }
+    
+    // Handle negative values
+    r = copysign(r, x0);
+    
+    return r;
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_RCP_H
diff --git a/mathfuncs_sqrt.h b/mathfuncs_sqrt.h
new file mode 100644
index 0000000..2c43478
--- /dev/null
+++ b/mathfuncs_sqrt.h
@@ -0,0 +1,69 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_SQRT_H
+#define MATHFUNCS_SQRT_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
+  {
+    std::cout << "mathfuncs::sqrt\n";
+    // Handle special case: zero
+    boolvec_t is_zero = x <= RV(0.0);
+    x = ifthen(is_zero, RV(1.0), x);
+    
+    // Initial guess
+    assert(all(x > RV(0.0)));
+#if 0
+    intvec_t ilogb_x = ilogb(x);
+    realvec_t r = scalbn(RV(M_SQRT2), ilogb_x >> 1);
+    // TODO: divide by M_SQRT2 if ilogb_x % 2 == 1 ?
+#else
+    real_t correction =
+      std::scalbn(4.0/3.0, FP::exponent_offset >> 1) *
+      (FP::exponent_offset & 1 ? 1.0 / M_SQRT1_2 : 1.0);
+    realvec_t r = lsr(x.as_int(), 1).as_float() * RV(correction);
+    real_t one=1.0;
+    std::cout << "x=" << x << " r=" << r << " eo=" << FP::exponent_offset << " corr=" << correction << " ione=" << RV(one).as_int() << "\n";
+#endif
+    
+    // Iterate
+    // nmax iterations give an accuracy of 2^nmax binary digits. 5
+    // iterations suffice for double precision with its 53 digits.
+    int const nmax = 5;
+    std::cout << "bits" << FP::bits << "nmax" << nmax << "\n";
+    for (int n=1; n<nmax; ++n) {
+      // Step
+      assert(all(x > RV(0.0)));
+      // Newton method:
+      // Solve   f(r) = 0   for   f(r) = x - r^2
+      //    r <- r - f(r) / f'(r)
+      //    r <- (r + x / r) / 2
+      r = RV(0.5) * (r + x / r);
+    }
+    
+    // Handle special case: zero
+    r = ifthen(is_zero, RV(0.0), r);
+    
+    return r;
+  }
+  
+  
+  
+  template<typename realvec_t>
+  realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
+  {
+    return rcp(sqrt(x));
+  }
+  
+}; // namespace vecmathlib
+
+#endif  // #ifndef MATHFUNCS_SQRT_H
diff --git a/vec_base.h b/vec_base.h
new file mode 100644
index 0000000..ab4a558
--- /dev/null
+++ b/vec_base.h
@@ -0,0 +1,277 @@
+// -*-C++-*-
+
+#ifndef VEC_BASE_H
+#define VEC_BASE_H
+
+#include <iostream>
+
+namespace vecmathlib {
+  
+  template<typename real_t, int size>
+  struct boolvec {
+  };
+  
+  template<typename real_t, int size>
+  struct intvec {
+  };
+  
+  template<typename real_t, int size>
+  struct realvec {
+  };
+  
+
+  
+  // boolvec wrappers
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> as_int(boolvec<real_t, size> x)
+  {
+    return x.as_int();
+  }
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> convert_int(boolvec<real_t, size> x)
+  {
+    return x.convert_int();
+  }
+  
+  template<typename real_t, int size>
+  inline bool all(boolvec<real_t, size> x) { return x.all(); }
+  
+  template<typename real_t, int size>
+  inline bool any(boolvec<real_t, size> x) { return x.any(); }
+  
+  template<typename real_t, int size>
+  inline
+  intvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                               intvec<real_t, size> x,
+                               intvec<real_t, size> y)
+  {
+    return c.ifthen(x, y);
+  }
+  
+  template<typename real_t, int size>
+  inline
+  realvec<real_t, size> ifthen(boolvec<real_t, size> c,
+                                 realvec<real_t, size> x,
+                                 realvec<real_t, size> y)
+  {
+    return c.ifthen(x, y);
+  }
+  
+  
+  
+  // intvec wrappers
+  
+  template<typename real_t, int size>
+  inline boolvec<real_t, size> as_bool(intvec<real_t, size> x)
+  {
+    return x.as_bool();
+  }
+  
+  template<typename real_t, int size>
+  inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x)
+  {
+    return x.convert_bool();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> as_float(intvec<real_t, size> x)
+  {
+    return x.as_float();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> convert_float(intvec<real_t, size> x)
+  {
+    return x.convert_float();
+  }
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                   typename intvec<real_t, size>::int_t n)
+  {
+    return x.lsr(n);
+  }
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+                                   intvec<real_t, size> n)
+  {
+    return x.lsr(n);
+  }
+  
+  
+  
+  // realvec wrappers
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> as_int(realvec<real_t, size> x)
+  {
+    return x.as_int();
+  }
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> convert_int(realvec<real_t, size> x)
+  {
+    return x.convert_int();
+  }
+  
+  template<typename real_t, int size>
+  inline
+  typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x)
+  {
+    return x.prod();
+  }
+  
+  template<typename real_t, int size>
+  inline
+  typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x)
+  {
+    return x.sum();
+  }
+  
+  
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> copysign(realvec<real_t, size> x,
+                                          realvec<real_t, size> y)
+  {
+    return x.copysign(y);
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> fabs(realvec<real_t, size> x)
+  {
+    return x.fabs();
+  }
+  
+  template<typename real_t, int size>
+  inline intvec<real_t, size> ilogb(realvec<real_t, size> x)
+  {
+    return x.ilogb();
+  }
+  
+  template<typename real_t, int size>
+  inline
+  realvec<real_t, size> scalbn(realvec<real_t, size> x,
+                                 intvec<real_t, size> n)
+  {
+    return x.scalbn(n);
+  }
+  
+  template<typename real_t, int size>
+  inline boolvec<real_t, size> signbit(realvec<real_t, size> x)
+  {
+    return x.signbit();
+  }
+  
+  
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> acos(realvec<real_t, size> x)
+  {
+    return x.acos();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> asin(realvec<real_t, size> x)
+  {
+    return x.asin();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> atan(realvec<real_t, size> x)
+  {
+    return x.atan();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> floor(realvec<real_t, size> x)
+  {
+    return x.floor();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> log(realvec<real_t, size> x)
+  {
+    return x.log();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> log10(realvec<real_t, size> x)
+  {
+    return x.log10();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> log1p(realvec<real_t, size> x)
+  {
+    return x.log1p();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> log2(realvec<real_t, size> x)
+  {
+    return x.log2();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> rcp(realvec<real_t, size> x)
+  {
+    return x.rcp();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> rsqrt(realvec<real_t, size> x)
+  {
+    return x.rsqrt();
+  }
+  
+  template<typename real_t, int size>
+  inline realvec<real_t, size> sqrt(realvec<real_t, size> x)
+  {
+    return x.sqrt();
+  }
+  
+  
+  
+  template<typename real_t, int size>
+  std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x)
+  {
+    os << "[";
+    for (int i=0; i<size; ++i) {
+      if (i!=0) os << ",";
+      os << x[i];
+    }
+    os << "]";
+    return os;
+  }
+  
+  template<typename real_t, int size>
+  std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x)
+  {
+    os << "[";
+    for (int i=0; i<size; ++i) {
+      if (i!=0) os << ",";
+      os << x[i];
+    }
+    os << "]";
+    return os;
+  }
+  
+  template<typename real_t, int size>
+  std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x)
+  {
+    os << "[";
+    for (int i=0; i<size; ++i) {
+      if (i!=0) os << ",";
+      os << x[i];
+    }
+    os << "]";
+    return os;
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_BASE_H
diff --git a/vec_double_avx.h b/vec_double_avx.h
new file mode 100644
index 0000000..258ffd0
--- /dev/null
+++ b/vec_double_avx.h
@@ -0,0 +1,588 @@
+// -*-C++-*-
+
+#ifndef VEC_DOUBLE_AVX_H
+#define VEC_DOUBLE_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+  
+  template<> struct boolvec<double,4>;
+  template<> struct intvec<double,4>;
+  template<> struct realvec<double,4>;
+  
+  
+  
+  template<>
+  struct boolvec<double,4>: floatprops<double>
+  {
+    static const int size = 4;
+    typedef bool scalar_t;
+    typedef __m256d bvector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                  "vector size is wrong");
+    
+  private:
+    // true values have the sign bit set, false values have it unset
+    static uint_t from_bool(bool a) { return - uint_t(a); }
+    static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+  public:
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    boolvec(boolvec const& x): v(x.v) {}
+    boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    boolvec(bvector_t x): v(x) {}
+    boolvec(bool a):
+    v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+    boolvec(bool const* as):
+    v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
+                                            from_bool(as[2]),
+                                            from_bool(as[1]),
+                                            from_bool(as[0])))) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+    boolvec& set_elt(int n, bool a)
+    {
+      return ((int_t*)&v)[n] = from_bool(a), *this;
+    }
+    
+    
+    
+    auto as_int() const -> intvec_t;      // defined after intvec
+    auto convert_int() const -> intvec_t; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+    
+    boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); }
+    boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); }
+    boolvec operator==(boolvec x) const { return !(*this==x); }
+    boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); }
+    
+    bool all() const
+    {
+      return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+    }
+    bool any() const
+    {
+      return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+    }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    auto ifthen(intvec_t x,
+                intvec_t y) const -> intvec_t; // defined after intvec
+    auto ifthen(realvec_t x,
+                realvec_t y) const -> realvec_t; // defined after realvec
+    
+  };
+  
+  
+  
+  template<>
+  struct intvec<double,4>: floatprops<double>
+  {
+    static const int size = 4;
+    typedef int_t scalar_t;
+    typedef __m256i ivector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    intvec(intvec const& x): v(x.v) {}
+    intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    intvec(ivector_t x): v(x) {}
+    intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
+    intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+    intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+    
+    
+    
+    auto as_bool() const -> boolvec_t { return _mm256_castsi256_pd(v); }
+    auto convert_bool() const -> boolvec_t
+    {
+      // Result: convert_bool(0)=false, convert_bool(else)=true
+      // There is no intrinsic to compare with zero. Instead, we check
+      // whether x is positive and x-1 is negative.
+      intvec x = *this;
+      intvec xm1 = x - 1;
+      // We know that boolvec values depend only on the sign bit
+      // return (~xm1 | x).as_bool();
+      return x.as_bool() || !xm1.as_bool();
+    }
+    auto as_float() const -> realvec_t; // defined after realvec
+    auto convert_float() const -> realvec_t; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return *this; }
+    intvec operator-() const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      v01 = _mm_sub_epi64(_mm_set1_epi64x(0), v01);
+      v23 = _mm_sub_epi64(_mm_set1_epi64x(0), v23);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    
+    intvec operator+(intvec x) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      __m128i xv01 = _mm256_castsi256_si128(x.v);
+      __m128i xv23 = _mm256_extractf128_si256(x.v, 1);
+      v01 = _mm_add_epi64(v01, xv01);
+      v23 = _mm_add_epi64(v23, xv23);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec operator-(intvec x) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      __m128i xv01 = _mm256_castsi256_si128(x.v);
+      __m128i xv23 = _mm256_extractf128_si256(x.v, 1);
+      v01 = _mm_sub_epi64(v01, xv01);
+      v23 = _mm_sub_epi64(v23, xv23);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+    
+    
+    
+    intvec operator~() const
+    {
+      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(IV(~U(0))),
+                                               _mm256_castsi256_pd(v)));
+    }
+    
+    intvec operator&(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
+                                               _mm256_castsi256_pd(x.v)));
+    }
+    intvec operator|(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
+                                              _mm256_castsi256_pd(x.v)));
+    }
+    intvec operator^(intvec x) const
+    {
+      return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
+                                               _mm256_castsi256_pd(x.v)));
+    }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    // SSE2 shift instructions potentially relevant for 64-bit
+    // operations:
+    //    _mm_slli_epi64
+    //    _mm_srli_epi64
+    //    _mm_sll_epi64
+    //    _mm_srl_epi64
+    //    _mm_srai_epi32
+    //    _mm_sra_epi32
+    //    _mm_srli_si128
+    //    _mm_slli_si128
+    intvec lsr(int_t n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      v01 = _mm_srli_epi64(v01, n);
+      v23 = _mm_srli_epi64(v23, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec operator>>(int_t n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      // There is no _mm_srai_epi64. To emulate it, add 0x8000000
+      // before shifting, and subtracted the shifted 0x80000000 after shifting
+#if 0
+      __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(v01, 63));
+      __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(v23, 63));
+      v01 = _mm_xor_si128(signmask01, v01);
+      v23 = _mm_xor_si128(signmask23, v23);
+      v01 = _mm_srli_epi64(v01, n);
+      v23 = _mm_srli_epi64(v23, n);
+      v01 = _mm_xor_si128(signmask01, v01);
+      v23 = _mm_xor_si128(signmask23, v23);
+#else
+      // Convert signed to unsiged
+      v01 += _mm_set1_epi64x(U(1) << (bits-1));
+      v23 += _mm_set1_epi64x(U(1) << (bits-1));
+      // Shift
+      v01 = _mm_srli_epi64(v01, n);
+      v23 = _mm_srli_epi64(v23, n);
+      // Undo conversion
+      v01 -= _mm_set1_epi64x(U(1) << (bits-n));
+      v23 -= _mm_set1_epi64x(U(1) << (bits-n));
+#endif
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec operator<<(int_t n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      v01 = _mm_slli_epi64(v01, n);
+      v23 = _mm_slli_epi64(v23, n);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      __m128i nv01 = _mm256_castsi256_si128(n.v);
+      __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+      v01 = _mm_srl_epi64(v01, nv01);
+      v23 = _mm_srl_epi64(v23, nv23);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec operator>>(intvec n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      __m128i nv01 = _mm256_castsi256_si128(n.v);
+      __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+#if 0
+      // There is no _mm_srai_epi64. To emulate it, invert all bits
+      // before and after shifting if the sign bit is set.
+      __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(v01, 63));
+      __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
+                                         _mm_srli_epi64(v23, 63));
+      v01 = _mm_xor_si128(signmask01, v01);
+      v23 = _mm_xor_si128(signmask23, v23);
+      v01 = _mm_srl_epi64(v01, nv01);
+      v23 = _mm_srl_epi64(v23, nv23);
+      v01 = _mm_xor_si128(signmask01, v01);
+      v23 = _mm_xor_si128(signmask23, v23);
+#else
+      // Convert signed to unsiged
+      v01 += _mm_set1_epi64x(U(1) << (bits-1));
+      v23 += _mm_set1_epi64x(U(1) << (bits-1));
+      // Shift
+      v01 = _mm_srl_epi64(v01, nv01);
+      v23 = _mm_srl_epi64(v23, nv23);
+      // Undo conversion
+      v01 -= _mm_sll_epi64(_mm_set1_epi64x(1),
+                           _mm_sub_epi64(_mm_set1_epi64x(bits), nv01));
+      v23 -= _mm_sll_epi64(_mm_set1_epi64x(1),
+                           _mm_sub_epi64(_mm_set1_epi64x(bits), nv23));
+#endif
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec operator<<(intvec n) const
+    {
+      __m128i v01 = _mm256_castsi256_si128(v);
+      __m128i v23 = _mm256_extractf128_si256(v, 1);
+      __m128i nv01 = _mm256_castsi256_si128(n.v);
+      __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+      v01 = _mm_sll_epi64(v01, nv01);
+      v23 = _mm_sll_epi64(v23, nv23);
+      return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+    }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+  };
+  
+  
+  
+  template<>
+  struct realvec<double,4>: floatprops<double>
+  {
+    static const int size = 4;
+    typedef real_t scalar_t;
+    typedef __m256d vector_t;
+    
+    static_assert(size * sizeof(real_t) == sizeof(vector_t),
+                  "vector size is wrong");
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    realvec(realvec const& x): v(x.v) {}
+    realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(_mm256_set1_pd(a)) {}
+    realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+    realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+    
+    
+    
+    intvec_t as_int() const { return _mm256_castpd_si256(v); }
+    intvec_t convert_int() const
+    {
+#if 0
+      __m128i iv0123 = _mm256_cvtpd_epi32(v);
+      __m128i iv2301 = _mm_shuffle_ps(iv0123, iv0123, 0b10110001);
+      __m256i iv01232301 =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(iv0123), iv2301, 1);
+      __m256i zero = _mm256_setzero_ps();
+      return _mm256_unpacklo_ps(iv01232301, zero);
+#else
+      realvec x = _mm256_floor_pd(v);
+      intvec_t ix = x.as_int();
+      boolvec_t sign = ix.as_bool();
+      intvec_t exponent = (ix & exponent_mask) >> mantissa_bits;
+      ix &= mantissa_mask;
+      ix |= U(1) << mantissa_bits;  // add hidden bit
+      ix <<= exponent - exponent_offset + 52; // ???
+      ix = ifthen(sign, -ix, ix);
+      return ix;
+#endif
+    }
+    
+    
+    
+    realvec operator+() const { return *this; }
+    realvec operator-() const { return _mm256_sub_pd(_mm256_set1_pd(0.0), v); }
+    
+    realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); }
+    realvec operator-(realvec x) const { return _mm256_sub_pd(v, x.v); }
+    realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); }
+    realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const
+    {
+      return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+    }
+    real_t sum() const
+    {
+      return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+    }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
+    }
+    boolvec_t operator!=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_NEQ_OQ);
+    }
+    boolvec_t operator<(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
+    }
+    boolvec_t operator<=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
+    }
+    boolvec_t operator>(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
+    }
+    boolvec_t operator>=(realvec const& x) const
+    {
+      return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
+    }
+    
+    
+    
+#if 0
+    realvec copysign(realvec y) const
+    {
+      uint_t signmask = U(1) << (bits-1);
+      intvec_t value = as_int() & IV(~signmask);
+      intvec_t sign = y.as_int() & IV(signmask);
+      return (sign | value).as_float();
+    }
+    
+    realvec fabs() const
+    {
+      uint_t signmask = U(1) << (bits-1);
+      return (as_int() & IV(~signmask)).as_float();
+    }
+    
+    intvec_t ilogb() const
+    {
+      intvec_t exponent_mask =
+        ((U(1) << exponent_bits) - U(1)) << mantissa_bits;
+      return lsr(as_int() & exponent_mask, mantissa_bits) - IV(exponent_offset);
+    }
+    
+    realvec scalbn(intvec_t n) const
+    {
+      return *this * ((n + exponent_offset) << mantissa_bits).as_float();
+    }
+    
+    boolvec_t signbit() const
+    {
+      return v;
+    }
+#endif
+    
+    realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+    realvec fabs() const { return MF::vml_fabs(*this); }
+    intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+    realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
+    boolvec_t signbit() const { return v; }
+    
+    
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec floor() const { return _mm256_floor_pd(v); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
+    // realvec rcp() const { return MF::vml_rcp(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    realvec sqrt() const { return _mm256_sqrt_pd(v); }
+    // realvec sqrt() const { return MF::vml_sqrt(*this); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<double,4>::as_int() const -> intvec_t
+  {
+    return _mm256_castpd_si256(v);
+  }
+  
+  inline
+  auto boolvec<double,4>::convert_int() const -> intvec_t
+  {
+    //return ifthen(v, U(1), U(0));
+    return lsr(as_int(), bits-1);
+  }
+  
+  inline
+  auto boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return ifthen(x.as_float(), y.as_float()).as_int();
+  }
+  
+  inline
+  auto boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return _mm256_blendv_pd(y.v, x.v, v);
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<double,4>::as_float() const -> realvec_t
+  {
+    return _mm256_castsi256_pd(v);
+  }
+  
+  inline auto intvec<double,4>::convert_float() const -> realvec_t
+  {
+    intvec x = v;
+    uint_t signbitmask = U(1) << (bits-1);
+    // make unsigned
+    x += signbitmask;
+    // convert lower 52 bits
+    intvec xlo = x & IV((U(1) << mantissa_bits) - 1);
+    int_t exponent_0 = exponent_offset + mantissa_bits;
+    xlo = xlo | exponent_0;
+    realvec_t flo = xlo.as_float() - FP::as_float(exponent_0);
+    // convert upper 22 bits
+    intvec xhi = x.lsr(U(mantissa_bits));
+    int_t exponent_52 = exponent_0 + mantissa_bits;
+    xhi = xhi | exponent_52;
+    realvec_t fhi = xhi.as_float() - FP::as_float(exponent_52);
+    return flo + fhi - R(signbitmask);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_DOUBLE_AVX_H
diff --git a/vec_float.h b/vec_float.h
new file mode 100644
index 0000000..6025ab7
--- /dev/null
+++ b/vec_float.h
@@ -0,0 +1,305 @@
+// -*-C++-*-
+
+#ifndef VEC_FLOAT_H
+#define VEC_FLOAT_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+
+
+namespace vecmathlib {
+  
+  template<> struct boolvec<float,1>;
+  template<> struct intvec<float,1>;
+  template<> struct realvec<float,1>;
+  
+  
+  
+  template<>
+  struct boolvec<float,1>: floatprops<float>
+  {
+    static const int size = 1;
+    typedef bool scalar_t;
+    typedef bool bvector_t;
+    
+    typedef boolvec boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    bvector_t v;
+    
+    boolvec() {}
+    boolvec(boolvec const& x): v(x.v) {}
+    boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+    //boolvec(vector_t x): v(x) {}
+    boolvec(bool a): v(a) {}
+    boolvec(bool const* as): v(as[0]) {}
+    
+    operator bvector_t() const { return v; }
+    bool operator[](int n) const { return v; }
+    boolvec& set_elt(int n, bool a) { return v=a, *this; }
+    
+    
+    
+    auto as_int() const -> intvec_t;      // defined after intvec
+    auto convert_int() const -> intvec_t; // defined after intvec
+    
+    
+    
+    boolvec operator!() const { return !v; }
+    
+    boolvec operator&&(boolvec x) const { return v&&x.v; }
+    boolvec operator||(boolvec x) const { return v||x.v; }
+    boolvec operator==(boolvec x) const { return v==x.v; }
+    boolvec operator!=(boolvec x) const { return v!=x.v; }
+    
+    bool all() const { return v; }
+    bool any() const { return v; }
+    
+    
+    
+    // ifthen(condition, then-value, else-value)
+    auto ifthen(intvec_t x,
+                intvec_t y) const -> intvec_t; // defined after intvec
+    auto ifthen(realvec_t x,
+                realvec_t y) const -> realvec_t; // defined after realvec
+    
+  };
+  
+  
+  
+  template<>
+  struct intvec<float,1>: floatprops<float>
+  {
+    static const int size = 1;
+    typedef int_t scalar_t;
+    typedef int_t ivector_t;
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec intvec_t;
+    typedef realvec<real_t, size> realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    ivector_t v;
+    
+    intvec() {}
+    intvec(intvec const& x): v(x.v) {}
+    intvec& operator=(intvec const& x) { return v=x.v, *this; }
+    //intvec(vector_t x): v(x) {}
+    intvec(int_t a): v(a) {}
+    intvec(int_t const* as): v(as[0]) {}
+    
+    operator ivector_t() const { return v; }
+    int_t operator[](int n) const { return v; }
+    intvec& set_elt(int n, int_t a) { return v=a, *this; }
+    
+    
+    
+    auto as_bool() const -> boolvec_t { return *(bool const*)&v; }
+    auto convert_bool() const -> boolvec_t { return v; }
+    auto as_float() const -> realvec_t; // defined after realvec
+    auto convert_float() const -> realvec_t; // defined after realvec
+    
+    
+    
+    // Note: not all arithmetic operations are supported!
+    
+    intvec operator+() const { return +v; }
+    intvec operator-() const { return -v; }
+    
+    intvec operator+(intvec x) const { return v+x.v; }
+    intvec operator-(intvec x) const { return v-x.v; }
+    
+    intvec& operator+=(intvec const& x) { return *this=*this+x; }
+    intvec& operator-=(intvec const& x) { return *this=*this-x; }
+
+
+    
+    intvec operator~() const { return ~v; }
+    
+    intvec operator&(intvec x) const { return v&x.v; }
+    intvec operator|(intvec x) const { return v|x.v; }
+    intvec operator^(intvec x) const { return v^x.v; }
+    
+    intvec& operator&=(intvec const& x) { return *this=*this&x; }
+    intvec& operator|=(intvec const& x) { return *this=*this|x; }
+    intvec& operator^=(intvec const& x) { return *this=*this^x; }
+    
+    
+    
+    intvec lsr(int_t n) const { return U(v)>>n; }
+    intvec operator>>(int_t n) const { return v>>n; }
+    intvec operator<<(int_t n) const { return v<<n; }
+    intvec& operator>>=(int_t n) { return *this=*this>>n; }
+    intvec& operator<<=(int_t n) { return *this=*this<<n; }
+    
+    intvec lsr(intvec n) const { return U(v)>>n; }
+    intvec operator>>(intvec n) const { return v>>n; }
+    intvec operator<<(intvec n) const { return v<<n; }
+    intvec& operator>>=(intvec n) { return *this=*this>>n; }
+    intvec& operator<<=(intvec n) { return *this=*this<<n; }
+  };
+  
+  
+  
+  template<>
+  struct realvec<float,1>: floatprops<float>
+  {
+    static int const size = 1;
+    typedef real_t scalar_t;
+    typedef real_t vector_t;
+    
+    typedef boolvec<real_t, size> boolvec_t;
+    typedef intvec<real_t, size> intvec_t;
+    typedef realvec realvec_t;
+    
+    // Short names for type casts
+    typedef real_t R;
+    typedef int_t I;
+    typedef uint_t U;
+    typedef realvec_t RV;
+    typedef intvec_t IV;
+    typedef boolvec_t BV;
+    typedef floatprops<real_t> FP;
+    typedef mathfuncs<realvec_t> MF;
+    
+    
+    
+    vector_t v;
+    
+    realvec() {}
+    realvec(realvec const& x): v(x.v) {}
+    realvec& operator=(realvec const& x) { return v=x.v, *this; }
+    //realvec(vector_t x): v(x) {}
+    realvec(real_t a): v(a) {}
+    realvec(real_t const* as): v(as[0]) {}
+    
+    operator vector_t() const { return v; }
+    real_t operator[](int n) const { return v; }
+    realvec& set_elt(int n, real_t a) { return v=a, *this; }
+    
+    
+    
+    intvec_t as_int() const { return FP::as_int(v); }
+    intvec_t convert_int() const { return MF::vml_convert_int(v); }
+    
+    
+    
+    realvec operator+() const { return +v; }
+    realvec operator-() const { return -v; }
+    
+    realvec operator+(realvec x) const { return v+x.v; }
+    realvec operator-(realvec x) const { return v-x.v; }
+    realvec operator*(realvec x) const { return v*x.v; }
+    realvec operator/(realvec x) const { return v/x.v; }
+    
+    realvec& operator+=(realvec const& x) { return *this=*this+x; }
+    realvec& operator-=(realvec const& x) { return *this=*this-x; }
+    realvec& operator*=(realvec const& x) { return *this=*this*x; }
+    realvec& operator/=(realvec const& x) { return *this=*this/x; }
+    
+    real_t prod() const { return v; }
+    real_t sum() const { return v; }
+    
+    
+    
+    boolvec_t operator==(realvec const& x) const { return v==x.v; }
+    boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
+    boolvec_t operator<(realvec const& x) const { return v<x.v; }
+    boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
+    boolvec_t operator>(realvec const& x) const { return v>x.v; }
+    boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
+    
+    
+    
+    realvec copysign(realvec x) const { return MF::vml_copysign(v, x.v); }
+    realvec fabs() const { return MF::vml_fabs(v); }
+    intvec_t ilogb() const { return MF::vml_ilogb(v); }
+    realvec scalbn(intvec_t n) const { return MF::vml_scalbn(v, n.v); }
+    boolvec_t signbit() const { return MF::vml_signbit(v); }
+    
+    realvec acos() const { return MF::vml_acos(*this); }
+    realvec asin() const { return MF::vml_asin(*this); }
+    realvec atan() const { return MF::vml_atan(*this); }
+    realvec floor() const { return std::floor(v); }
+    realvec log() const { return MF::vml_log(*this); }
+    realvec log10() const { return MF::vml_log10(*this); }
+    realvec log1p() const { return MF::vml_log1p(*this); }
+    realvec log2() const { return MF::vml_log2(*this); }
+    realvec rcp() const { return MF::vml_rcp(*this); }
+    realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+    realvec sqrt() const { return MF::vml_sqrt(*this); }
+  };
+  
+  
+  
+  // boolvec definitions
+  
+  inline
+  auto boolvec<float,1>::as_int() const -> intvec_t
+  {
+    return v;
+  }
+  
+  inline
+  auto boolvec<float,1>::convert_int() const -> intvec_t
+  {
+    return v;
+  }
+  
+  inline
+  auto boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+  {
+    return v ? x.v : y.v;
+  }
+  
+  inline
+  auto boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+  {
+    return v ? x.v : y.v;
+  }
+
+  
+  
+  // intvec definitions
+  
+  inline auto intvec<float,1>::as_float() const -> realvec_t
+  {
+    return FP::as_float(v);
+  }
+  
+  inline auto intvec<float,1>::convert_float() const -> realvec_t
+  {
+    return MF::vml_convert_float(v);
+  }
+  
+} // namespace vecmathlib
+
+#endif  // #ifndef VEC_FLOAT_H
diff --git a/vecmathlib.h b/vecmathlib.h
new file mode 100644
index 0000000..2264545
--- /dev/null
+++ b/vecmathlib.h
@@ -0,0 +1,11 @@
+// -*-C++-*-
+
+#ifndef VECMATHLIB_H
+#define VECMATHLIB_H
+
+#include "vec_float.h"
+#if defined __AVX__             // Intel AVX
+#  include "vec_double_avx.h"
+#endif
+
+#endif // #ifndef VECMATHLIB_H
author	Erik Schnetter <schnetter@gmail.com>	2012-11-30 15:50:03 -0500
committer	Erik Schnetter <schnetter@gmail.com>	2012-11-30 15:50:03 -0500
commit	d2614759a1d542c41af59b04d8711246d2a1e876 (patch)
tree	2689209034d9ccc3d29208d50b82b4f89116aa1b
download	vecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.zip vecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.tar.gz