summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorErik Schnetter <schnetter@gmail.com>2012-11-30 15:50:03 -0500
committerErik Schnetter <schnetter@gmail.com>2012-11-30 15:50:03 -0500
commitd2614759a1d542c41af59b04d8711246d2a1e876 (patch)
tree2689209034d9ccc3d29208d50b82b4f89116aa1b
downloadvecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.zip
vecmathlib-d2614759a1d542c41af59b04d8711246d2a1e876.tar.gz
Import initial version
-rw-r--r--IDEAS7
-rw-r--r--build.ninja33
-rw-r--r--empty.cc0
-rw-r--r--example.cc32
-rw-r--r--floatprops.h152
-rw-r--r--mathfuncs.h15
-rw-r--r--mathfuncs_asin.h83
-rw-r--r--mathfuncs_base.h67
-rw-r--r--mathfuncs_convert.h86
-rw-r--r--mathfuncs_fabs.h52
-rw-r--r--mathfuncs_log.h70
-rw-r--r--mathfuncs_rcp.h48
-rw-r--r--mathfuncs_sqrt.h69
-rw-r--r--vec_base.h277
-rw-r--r--vec_double_avx.h588
-rw-r--r--vec_float.h305
-rw-r--r--vecmathlib.h11
17 files changed, 1895 insertions, 0 deletions
diff --git a/IDEAS b/IDEAS
new file mode 100644
index 0000000..6a2f385
--- /dev/null
+++ b/IDEAS
@@ -0,0 +1,7 @@
+implement cosh, sinh via expansion, similar to asin
+ https://en.wikipedia.org/wiki/Hyperbolic_function
+
+implement exp via cosh, sinh
+
+implement acosh, asinh via expansion, similar to asin
+ https://en.wikipedia.org/wiki/Inverse_hyperbolic_function
diff --git a/build.ninja b/build.ninja
new file mode 100644
index 0000000..fbfd02b
--- /dev/null
+++ b/build.ninja
@@ -0,0 +1,33 @@
+ar = ar
+cxx = g++
+
+cppflags =
+cxxflags = -std=gnu++11 -Wall -g -march=native
+# -Ofast
+ldflags = -L.
+
+rule cxx
+ command = $cxx $cppflags -MMD -MT $out -MF $out.d $cxxflags -c $in -o $out
+ description = CXX $out
+ depfile = $out.d
+
+rule ar
+ command = rm -f $out && $ar crs $out $in && ranlib $out
+ description = AR $out
+
+rule link
+ command = $cxx $cppflags $cxxflags $ldflags -o $out $in $libs
+ description = LINK $out
+
+build empty.o: cxx empty.cc
+build example.o: cxx example.cc
+build test.o: cxx test.cc
+
+build libvecmath.a: ar empty.o
+
+build example: link example.o | libvecmath.a
+ libs = -lvecmath
+build test: link test.o | libvecmath.a
+ libs = -lvecmath
+
+default example test
diff --git a/empty.cc b/empty.cc
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/empty.cc
diff --git a/example.cc b/example.cc
new file mode 100644
index 0000000..50415c2
--- /dev/null
+++ b/example.cc
@@ -0,0 +1,32 @@
+// -*-C++-*-
+
+#include "vec_float.h"
+#include "vec_double_avx.h"
+
+#include <iostream>
+
+using namespace std;
+
+int main(int argc, char** argv)
+{
+ using namespace vecmathlib;
+ typedef realvec<float,1> realvec_t;
+ // typedef realvec<double,4> realvec_t;
+ typedef realvec_t::boolvec_t boolvec_t;
+ typedef realvec_t::intvec_t intvec_t;
+
+ realvec_t x = 1.0;
+ realvec_t y = x + realvec_t(1.0);
+ y = sqrt(y);
+ realvec_t z = log(y);
+ boolvec_t b = x < y;
+ intvec_t i = convert_int(y);
+
+ cout << "x=" << x << "\n";
+ cout << "y=" << y << "\n";
+ cout << "z=" << z << "\n";
+ cout << "b=" << b << "\n";
+ cout << "i=" << i << "\n";
+
+ return 0;
+}
diff --git a/floatprops.h b/floatprops.h
new file mode 100644
index 0000000..7048112
--- /dev/null
+++ b/floatprops.h
@@ -0,0 +1,152 @@
+// -*-C++-*-
+
+#ifndef FLOATPROPS_H
+#define FLOATPROPS_H
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+
+
+namespace vecmathlib {
+
+ // A structure describing various properties of a floating point
+ // type. Most properties are already described in numeric_limits, so
+ // we inherit it.
+ template<typename real_t>
+ struct floatprops {
+ // Some interesting properties are:
+ // digits
+ // min_exponent
+ // max_exponent
+ };
+
+ template<typename int_t>
+ struct intprops {
+ };
+
+
+
+ // Properties of float
+ template<>
+ struct floatprops<float>: std::numeric_limits<float> {
+ typedef float real_t;
+ typedef int32_t int_t;
+ typedef uint32_t uint_t;
+
+ // Ensure the internal representation is what we expect
+ static_assert(is_signed, "real_t is not signed");
+ static_assert(radix==2, "real_t is not binary");
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const sign_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - sign_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + sign_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+ static uint_t const sign_mask = uint_t(1) << (bits-1);
+ static_assert((mantissa_mask & exponent_mask & sign_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | sign_mask) == ~uint_t(0),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static inline real_t as_float(int_t x) { return *(real_t*)&x; }
+ static inline int_t as_int(real_t x) { return *(int_t*)&x; }
+
+ // Convert values
+ static inline real_t convert_float(int_t x) { return real_t(x); }
+ static inline int_t convert_int(real_t x)
+ {
+ static_assert(sizeof(std::rint(x)) == sizeof(int_t),
+ "rint() has wrong return type");
+ return std::rint(x);
+ }
+ };
+
+ template<>
+ struct intprops<floatprops<float>::int_t> {
+ typedef float real_t;
+ };
+
+
+
+ // Properties of double
+ template<>
+ struct floatprops<double>: std::numeric_limits<double> {
+ typedef double real_t;
+ typedef int64_t int_t;
+ typedef uint64_t uint_t;
+
+ // Ensure the internal representation is what we expect
+ static_assert(is_signed, "real_t is not signed");
+ static_assert(radix==2, "real_t is not binary");
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const sign_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - sign_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + sign_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+ static uint_t const sign_mask = uint_t(1) << (bits-1);
+ static_assert((mantissa_mask & exponent_mask & sign_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | sign_mask) == ~uint_t(0),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static inline real_t as_float(int_t x) { return *(real_t*)&x; }
+ static inline int_t as_int(real_t x) { return *(int_t*)&x; }
+
+ // Convert values
+ static inline real_t convert_float(int_t x) { return real_t(x); }
+ static inline int_t convert_int(real_t x)
+ {
+ static_assert(sizeof(std::llrint(x)) == sizeof(int_t),
+ "llrint() has wrong return type");
+ return std::llrint(x);
+ }
+ };
+
+ template<>
+ struct intprops<floatprops<double>::int_t> {
+ typedef double real_t;
+ };
+
+
+
+ template<typename int_t>
+ inline typename intprops<int_t>::real_t as_float(int_t x)
+ {
+ typedef typename intprops<int_t>::real_t real_t;
+ return floatprops<real_t>::as_float(x);
+ }
+
+ template<typename real_t>
+ inline typename floatprops<real_t>::int_t as_int(real_t x)
+ {
+ return floatprops<real_t>::as_int(x);
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef FLOATPROPS_H
diff --git a/mathfuncs.h b/mathfuncs.h
new file mode 100644
index 0000000..e62358b
--- /dev/null
+++ b/mathfuncs.h
@@ -0,0 +1,15 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_H
+#define MATHFUNCS_H
+
+#include "mathfuncs_base.h"
+
+#include "mathfuncs_asin.h"
+#include "mathfuncs_convert.h"
+#include "mathfuncs_fabs.h"
+#include "mathfuncs_log.h"
+#include "mathfuncs_rcp.h"
+#include "mathfuncs_sqrt.h"
+
+#endif // #ifndef MATHFUNCS_H
diff --git a/mathfuncs_asin.h b/mathfuncs_asin.h
new file mode 100644
index 0000000..a8ffff4
--- /dev/null
+++ b/mathfuncs_asin.h
@@ -0,0 +1,83 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_ASIN_H
+#define MATHFUNCS_ASIN_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t x)
+ {
+ // Handle negative values
+ realvec_t x0 = x;
+ x = fabs(x);
+
+ // Reduce range using 1/x identity
+ assert(all(x >= RV(0.0)));
+ boolvec_t gt_one = x > RV(1.0);
+ x = ifthen(gt_one, rcp(x), x);
+
+ // Reduce range again using half-angle formula; see
+ // <https://en.wikipedia.org/wiki/Inverse_trigonometric_functions>.
+ // This is necessary for good convergence below.
+ x = x / (RV(1.0) + sqrt(RV(1.0) + x*x));
+
+ // Taylor expansion; see
+ // <https://en.wikipedia.org/wiki/Inverse_trigonometric_functions>.
+ assert(all(x >= RV(0.0) && x <= RV(0.5)));
+ int const nmax = 30; // ???
+ realvec_t y = x / (RV(1.0) + x*x);
+ realvec_t x2 = x * y;
+ realvec_t r = y;
+ for (int n=3; n<nmax; n+=2) {
+ y *= RV(R(n-1) / R(n)) * x2;
+ r += y;
+ }
+
+ // Undo second range reduction
+ r = RV(2.0) * r;
+
+ // Undo range reduction
+ r = ifthen(gt_one, RV(M_PI_2) - r, r);
+
+ // Handle negative values
+ r = copysign(r, x0);
+
+ return r;
+ }
+
+
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t x)
+ {
+ // Handle negative values
+ boolvec_t is_negative = signbit(x);
+ x = fabs(x);
+
+ realvec_t r = RV(2.0) * atan(sqrt(RV(1.0) - x*x) / (RV(1.0) + x));
+
+ // Handle negative values
+ r = ifthen(is_negative, RV(M_PI) - r, r);
+
+ return r;
+ }
+
+
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t x)
+ {
+ return RV(2.0) * atan(x / (RV(1.0) + sqrt(RV(1.0) - x*x)));
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/mathfuncs_base.h b/mathfuncs_base.h
new file mode 100644
index 0000000..0b9f04f
--- /dev/null
+++ b/mathfuncs_base.h
@@ -0,0 +1,67 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_BASE_H
+#define MATHFUNCS_BASE_H
+
+#include "floatprops.h"
+#include "vec_base.h"
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ struct mathfuncs {
+ typedef floatprops<typename realvec_t::real_t> FP;
+
+ typedef typename FP::real_t real_t;
+ typedef typename FP::int_t int_t;
+ typedef typename FP::uint_t uint_t;
+
+ static int const size = realvec_t::size;
+
+ // typedef realvec<real_t, size> realvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef boolvec<real_t, size> boolvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+
+ // asin
+ static realvec_t vml_acos(realvec_t x);
+ static realvec_t vml_asin(realvec_t x);
+ static realvec_t vml_atan(realvec_t x);
+
+ // convert
+ static realvec_t vml_convert_float(intvec_t x);
+ static intvec_t vml_convert_int(realvec_t x);
+
+ // fabs
+ static realvec_t vml_copysign(realvec_t x, realvec_t y);
+ static realvec_t vml_fabs(realvec_t x);
+ static intvec_t vml_ilogb(realvec_t x);
+ static realvec_t vml_scalbn(realvec_t x, intvec_t n);
+ static boolvec_t vml_signbit(realvec_t x);
+
+ // log
+ static realvec_t vml_log(realvec_t x);
+ static realvec_t vml_log10(realvec_t x);
+ static realvec_t vml_log1p(realvec_t x);
+ static realvec_t vml_log2(realvec_t x);
+
+ // rcp
+ static realvec_t vml_rcp(realvec_t x);
+
+ // sqrt
+ static realvec_t vml_rsqrt(realvec_t x);
+ static realvec_t vml_sqrt(realvec_t x);
+ };
+
+} // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_BASE_H
diff --git a/mathfuncs_convert.h b/mathfuncs_convert.h
new file mode 100644
index 0000000..4dfffef
--- /dev/null
+++ b/mathfuncs_convert.h
@@ -0,0 +1,86 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_CONVERT_H
+#define MATHFUNCS_CONVERT_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x)
+ {
+ // Convert in two passes. Convert as much as possible during the
+ // first pass (lobits), so that the second pass (hibits) may be
+ // omitted if the high bits are known to be zero.
+ int_t lobits = FP::mantissa_bits;
+ int_t hibits = FP::bits - lobits;
+
+ // Convert lower bits
+ intvec_t xlo = x & IV((U(1) << lobits) - 1);
+ // exponent for the equivalent floating point number
+ int_t exponent_lo = FP::exponent_offset + lobits;
+ xlo |= exponent_lo;
+ // subtract hidden mantissa bit
+ realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
+
+ // Convert upper bits
+ // make unsigned by subtracting largest negative number
+ x ^= FP::sign_mask;
+ intvec_t xhi = lsr(x, lobits);
+ // exponent for the equivalent floating point number
+ int_t exponent_hi = FP::exponent_offset + FP::bits;
+ xhi |= exponent_hi;
+ // add largest negative number, and subtract hidden mantissa bit
+ realvec_t
+ fhi = as_float(xhi) - RV(FP::as_float(exponent_hi) + R(FP::sign_mask));
+
+ // Combine results
+ return flo + fhi;
+ }
+
+
+
+ template<typename realvec_t>
+ auto mathfuncs<realvec_t>::vml_convert_int(realvec_t x) -> intvec_t
+ {
+ // Handle zero
+ boolvec_t is_zero = x == RV(0.0);
+ // Handle negative numbers
+ boolvec_t is_negative = signbit(x);
+ x = fabs(x);
+
+ // Round, by adding a large number that removes the excess
+ // precision
+ int_t large = U(1) << FP::mantissa_bits;
+ x += R(large);
+
+ intvec_t exponent = ilogb(x);
+ for (int i=0; i<intvec_t::size; ++i) {
+ assert(exponent[i] >= FP::mantissa_bits);
+ }
+ intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
+ // add hidden mantissa bit
+ ix |= U(1) << FP::mantissa_bits;
+ // shift according to exponent
+ ix <<= exponent - IV(FP::mantissa_bits);
+
+ // Undo the adding above
+ ix -= large;
+
+ // Handle negative numbers
+ ix = ifthen(is_negative, -ix, ix);
+ // Handle zero
+ ix = ifthen(is_zero, IV(0), ix);
+
+ return ix;
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_CONVERT_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
new file mode 100644
index 0000000..034fedb
--- /dev/null
+++ b/mathfuncs_fabs.h
@@ -0,0 +1,52 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_FABS_H
+#define MATHFUNCS_FABS_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
+ {
+ intvec_t value = as_int(x) & IV(~FP::sign_mask);
+ intvec_t sign = as_int(y) & IV(FP::sign_mask);
+ return as_float(sign | value);
+ }
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
+ {
+ return as_float(as_int(x) & IV(~FP::sign_mask));
+ }
+
+ template<typename realvec_t>
+ auto mathfuncs<realvec_t>::vml_ilogb(realvec_t x) -> intvec_t
+ {
+ return
+ lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits) -
+ IV(FP::exponent_offset);
+ }
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_scalbn(realvec_t x, intvec_t n)
+ {
+ return as_float(as_int(x) + (n << FP::mantissa_bits));
+ // return x * as_float((n + exponent_offset) << mantissa_bits);
+ }
+
+ template<typename realvec_t>
+ auto mathfuncs<realvec_t>::vml_signbit(realvec_t x) -> boolvec_t
+ {
+ return convert_bool(as_int(x) & IV(FP::sign_mask));
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_FABS_H
diff --git a/mathfuncs_log.h b/mathfuncs_log.h
new file mode 100644
index 0000000..8fb3204
--- /dev/null
+++ b/mathfuncs_log.h
@@ -0,0 +1,70 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_LOG_H
+#define MATHFUNCS_LOG_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x)
+ {
+ // Rescale
+ assert(all(x > RV(0.0)));
+ intvec_t ilogb_x = ilogb(x);
+ x = scalbn(x, -ilogb_x);
+ assert(all(x >= RV(1.0) && x < RV(2.0)));
+
+ // Approximate
+ assert(all(x >= RV(1.0) && x < RV(2.0)));
+ // log(x) = Sum[n=1,nmax,n%2==1] 2/n ((x-1) / (x+1))^n
+ int const nmax = 30;
+ x *= RV(M_SQRT1_2); // shift range to increase accuracy
+ realvec_t xm1_xp1 = (x - RV(1.0)) / (x + RV(1.0));
+ realvec_t xm1_xp1_2 = xm1_xp1 * xm1_xp1;
+ realvec_t y = RV(M_LOG2E * 2.0) * xm1_xp1;
+ realvec_t r = y;
+ for (int n=3; n<nmax; n+=2) {
+ y *= xm1_xp1_2;
+ r += y * RV(R(1.0) / R(n));
+ }
+ r += RV(0.5); // correct result for range shift
+
+ // Undo rescaling
+ r += convert_float(ilogb_x);
+
+ return r;
+ }
+
+
+
+ template<typename realvec_t>
+ inline
+ realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x)
+ {
+ return log2(x) * RV(M_LN2);
+ }
+
+ template<typename realvec_t>
+ inline
+ realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x)
+ {
+ return log(x) * RV(M_LOG10E);
+ }
+
+ template<typename realvec_t>
+ inline
+ realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x)
+ {
+ return log(RV(1.0) + x);
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_LOG_H
diff --git a/mathfuncs_rcp.h b/mathfuncs_rcp.h
new file mode 100644
index 0000000..1704d8f
--- /dev/null
+++ b/mathfuncs_rcp.h
@@ -0,0 +1,48 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_RCP_H
+#define MATHFUNCS_RCP_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x)
+ {
+ // Handle negative values
+ realvec_t x0 = x;
+ x = fabs(x);
+
+ // Initial guess
+ assert(all(x > RV(0.0)));
+ intvec_t ilogb_x = ilogb(x);
+ // For stability, choose a starting value that is below the result
+ realvec_t r = scalbn(RV(0.5), -ilogb_x);
+
+ // Iterate
+ int const nmax = 7;
+ for (int n=1; n<nmax; ++n) {
+ // Step
+ assert(all(x > RV(0.0)));
+ // Newton method:
+ // Solve f(r) = 0 for f(r) = x - 1/r
+ // r <- r - f(r) / f'(r)
+ // r <- 2 r - r^2 x
+ r *= RV(2.0) - r * x;
+ }
+
+ // Handle negative values
+ r = copysign(r, x0);
+
+ return r;
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_RCP_H
diff --git a/mathfuncs_sqrt.h b/mathfuncs_sqrt.h
new file mode 100644
index 0000000..2c43478
--- /dev/null
+++ b/mathfuncs_sqrt.h
@@ -0,0 +1,69 @@
+// -*-C++-*-
+
+#ifndef MATHFUNCS_SQRT_H
+#define MATHFUNCS_SQRT_H
+
+#include "mathfuncs_base.h"
+
+#include <cassert>
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
+ {
+ std::cout << "mathfuncs::sqrt\n";
+ // Handle special case: zero
+ boolvec_t is_zero = x <= RV(0.0);
+ x = ifthen(is_zero, RV(1.0), x);
+
+ // Initial guess
+ assert(all(x > RV(0.0)));
+#if 0
+ intvec_t ilogb_x = ilogb(x);
+ realvec_t r = scalbn(RV(M_SQRT2), ilogb_x >> 1);
+ // TODO: divide by M_SQRT2 if ilogb_x % 2 == 1 ?
+#else
+ real_t correction =
+ std::scalbn(4.0/3.0, FP::exponent_offset >> 1) *
+ (FP::exponent_offset & 1 ? 1.0 / M_SQRT1_2 : 1.0);
+ realvec_t r = lsr(x.as_int(), 1).as_float() * RV(correction);
+ real_t one=1.0;
+ std::cout << "x=" << x << " r=" << r << " eo=" << FP::exponent_offset << " corr=" << correction << " ione=" << RV(one).as_int() << "\n";
+#endif
+
+ // Iterate
+ // nmax iterations give an accuracy of 2^nmax binary digits. 5
+ // iterations suffice for double precision with its 53 digits.
+ int const nmax = 5;
+ std::cout << "bits" << FP::bits << "nmax" << nmax << "\n";
+ for (int n=1; n<nmax; ++n) {
+ // Step
+ assert(all(x > RV(0.0)));
+ // Newton method:
+ // Solve f(r) = 0 for f(r) = x - r^2
+ // r <- r - f(r) / f'(r)
+ // r <- (r + x / r) / 2
+ r = RV(0.5) * (r + x / r);
+ }
+
+ // Handle special case: zero
+ r = ifthen(is_zero, RV(0.0), r);
+
+ return r;
+ }
+
+
+
+ template<typename realvec_t>
+ realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
+ {
+ return rcp(sqrt(x));
+ }
+
+}; // namespace vecmathlib
+
+#endif // #ifndef MATHFUNCS_SQRT_H
diff --git a/vec_base.h b/vec_base.h
new file mode 100644
index 0000000..ab4a558
--- /dev/null
+++ b/vec_base.h
@@ -0,0 +1,277 @@
+// -*-C++-*-
+
+#ifndef VEC_BASE_H
+#define VEC_BASE_H
+
+#include <iostream>
+
+namespace vecmathlib {
+
+ template<typename real_t, int size>
+ struct boolvec {
+ };
+
+ template<typename real_t, int size>
+ struct intvec {
+ };
+
+ template<typename real_t, int size>
+ struct realvec {
+ };
+
+
+
+ // boolvec wrappers
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> as_int(boolvec<real_t, size> x)
+ {
+ return x.as_int();
+ }
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> convert_int(boolvec<real_t, size> x)
+ {
+ return x.convert_int();
+ }
+
+ template<typename real_t, int size>
+ inline bool all(boolvec<real_t, size> x) { return x.all(); }
+
+ template<typename real_t, int size>
+ inline bool any(boolvec<real_t, size> x) { return x.any(); }
+
+ template<typename real_t, int size>
+ inline
+ intvec<real_t, size> ifthen(boolvec<real_t, size> c,
+ intvec<real_t, size> x,
+ intvec<real_t, size> y)
+ {
+ return c.ifthen(x, y);
+ }
+
+ template<typename real_t, int size>
+ inline
+ realvec<real_t, size> ifthen(boolvec<real_t, size> c,
+ realvec<real_t, size> x,
+ realvec<real_t, size> y)
+ {
+ return c.ifthen(x, y);
+ }
+
+
+
+ // intvec wrappers
+
+ template<typename real_t, int size>
+ inline boolvec<real_t, size> as_bool(intvec<real_t, size> x)
+ {
+ return x.as_bool();
+ }
+
+ template<typename real_t, int size>
+ inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x)
+ {
+ return x.convert_bool();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> as_float(intvec<real_t, size> x)
+ {
+ return x.as_float();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> convert_float(intvec<real_t, size> x)
+ {
+ return x.convert_float();
+ }
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+ typename intvec<real_t, size>::int_t n)
+ {
+ return x.lsr(n);
+ }
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+ intvec<real_t, size> n)
+ {
+ return x.lsr(n);
+ }
+
+
+
+ // realvec wrappers
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> as_int(realvec<real_t, size> x)
+ {
+ return x.as_int();
+ }
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> convert_int(realvec<real_t, size> x)
+ {
+ return x.convert_int();
+ }
+
+ template<typename real_t, int size>
+ inline
+ typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x)
+ {
+ return x.prod();
+ }
+
+ template<typename real_t, int size>
+ inline
+ typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x)
+ {
+ return x.sum();
+ }
+
+
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> copysign(realvec<real_t, size> x,
+ realvec<real_t, size> y)
+ {
+ return x.copysign(y);
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> fabs(realvec<real_t, size> x)
+ {
+ return x.fabs();
+ }
+
+ template<typename real_t, int size>
+ inline intvec<real_t, size> ilogb(realvec<real_t, size> x)
+ {
+ return x.ilogb();
+ }
+
+ template<typename real_t, int size>
+ inline
+ realvec<real_t, size> scalbn(realvec<real_t, size> x,
+ intvec<real_t, size> n)
+ {
+ return x.scalbn(n);
+ }
+
+ template<typename real_t, int size>
+ inline boolvec<real_t, size> signbit(realvec<real_t, size> x)
+ {
+ return x.signbit();
+ }
+
+
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> acos(realvec<real_t, size> x)
+ {
+ return x.acos();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> asin(realvec<real_t, size> x)
+ {
+ return x.asin();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> atan(realvec<real_t, size> x)
+ {
+ return x.atan();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> floor(realvec<real_t, size> x)
+ {
+ return x.floor();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> log(realvec<real_t, size> x)
+ {
+ return x.log();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> log10(realvec<real_t, size> x)
+ {
+ return x.log10();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> log1p(realvec<real_t, size> x)
+ {
+ return x.log1p();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> log2(realvec<real_t, size> x)
+ {
+ return x.log2();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> rcp(realvec<real_t, size> x)
+ {
+ return x.rcp();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> rsqrt(realvec<real_t, size> x)
+ {
+ return x.rsqrt();
+ }
+
+ template<typename real_t, int size>
+ inline realvec<real_t, size> sqrt(realvec<real_t, size> x)
+ {
+ return x.sqrt();
+ }
+
+
+
+ template<typename real_t, int size>
+ std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x)
+ {
+ os << "[";
+ for (int i=0; i<size; ++i) {
+ if (i!=0) os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+ }
+
+ template<typename real_t, int size>
+ std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x)
+ {
+ os << "[";
+ for (int i=0; i<size; ++i) {
+ if (i!=0) os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+ }
+
+ template<typename real_t, int size>
+ std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x)
+ {
+ os << "[";
+ for (int i=0; i<size; ++i) {
+ if (i!=0) os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_BASE_H
diff --git a/vec_double_avx.h b/vec_double_avx.h
new file mode 100644
index 0000000..258ffd0
--- /dev/null
+++ b/vec_double_avx.h
@@ -0,0 +1,588 @@
+// -*-C++-*-
+
+#ifndef VEC_DOUBLE_AVX_H
+#define VEC_DOUBLE_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+
+ template<> struct boolvec<double,4>;
+ template<> struct intvec<double,4>;
+ template<> struct realvec<double,4>;
+
+
+
+ template<>
+ struct boolvec<double,4>: floatprops<double>
+ {
+ static const int size = 4;
+ typedef bool scalar_t;
+ typedef __m256d bvector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+ private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return - uint_t(a); }
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+ public:
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ bvector_t v;
+
+ boolvec() {}
+ boolvec(boolvec const& x): v(x.v) {}
+ boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x): v(x) {}
+ boolvec(bool a):
+ v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+ boolvec(bool const* as):
+ v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
+ from_bool(as[2]),
+ from_bool(as[1]),
+ from_bool(as[0])))) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+ boolvec& set_elt(int n, bool a)
+ {
+ return ((int_t*)&v)[n] = from_bool(a), *this;
+ }
+
+
+
+ auto as_int() const -> intvec_t; // defined after intvec
+ auto convert_int() const -> intvec_t; // defined after intvec
+
+
+
+ boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+
+ boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); }
+ boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); }
+ boolvec operator==(boolvec x) const { return !(*this==x); }
+ boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); }
+
+ bool all() const
+ {
+ return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+ }
+ bool any() const
+ {
+ return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+ }
+
+
+
+ // ifthen(condition, then-value, else-value)
+ auto ifthen(intvec_t x,
+ intvec_t y) const -> intvec_t; // defined after intvec
+ auto ifthen(realvec_t x,
+ realvec_t y) const -> realvec_t; // defined after realvec
+
+ };
+
+
+
+ template<>
+ struct intvec<double,4>: floatprops<double>
+ {
+ static const int size = 4;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ ivector_t v;
+
+ intvec() {}
+ intvec(intvec const& x): v(x.v) {}
+ intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x): v(x) {}
+ intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
+ intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+ intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+
+
+
+ auto as_bool() const -> boolvec_t { return _mm256_castsi256_pd(v); }
+ auto convert_bool() const -> boolvec_t
+ {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare with zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec x = *this;
+ intvec xm1 = x - 1;
+ // We know that boolvec values depend only on the sign bit
+ // return (~xm1 | x).as_bool();
+ return x.as_bool() || !xm1.as_bool();
+ }
+ auto as_float() const -> realvec_t; // defined after realvec
+ auto convert_float() const -> realvec_t; // defined after realvec
+
+
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ v01 = _mm_sub_epi64(_mm_set1_epi64x(0), v01);
+ v23 = _mm_sub_epi64(_mm_set1_epi64x(0), v23);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+
+ intvec operator+(intvec x) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ __m128i xv01 = _mm256_castsi256_si128(x.v);
+ __m128i xv23 = _mm256_extractf128_si256(x.v, 1);
+ v01 = _mm_add_epi64(v01, xv01);
+ v23 = _mm_add_epi64(v23, xv23);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec operator-(intvec x) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ __m128i xv01 = _mm256_castsi256_si128(x.v);
+ __m128i xv23 = _mm256_extractf128_si256(x.v, 1);
+ v01 = _mm_sub_epi64(v01, xv01);
+ v23 = _mm_sub_epi64(v23, xv23);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+
+ intvec& operator+=(intvec const& x) { return *this=*this+x; }
+ intvec& operator-=(intvec const& x) { return *this=*this-x; }
+
+
+
+ intvec operator~() const
+ {
+ return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(IV(~U(0))),
+ _mm256_castsi256_pd(v)));
+ }
+
+ intvec operator&(intvec x) const
+ {
+ return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
+ _mm256_castsi256_pd(x.v)));
+ }
+ intvec operator|(intvec x) const
+ {
+ return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
+ _mm256_castsi256_pd(x.v)));
+ }
+ intvec operator^(intvec x) const
+ {
+ return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
+ _mm256_castsi256_pd(x.v)));
+ }
+
+ intvec& operator&=(intvec const& x) { return *this=*this&x; }
+ intvec& operator|=(intvec const& x) { return *this=*this|x; }
+ intvec& operator^=(intvec const& x) { return *this=*this^x; }
+
+
+
+ // SSE2 shift instructions potentially relevant for 64-bit
+ // operations:
+ // _mm_slli_epi64
+ // _mm_srli_epi64
+ // _mm_sll_epi64
+ // _mm_srl_epi64
+ // _mm_srai_epi32
+ // _mm_sra_epi32
+ // _mm_srli_si128
+ // _mm_slli_si128
+ intvec lsr(int_t n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ v01 = _mm_srli_epi64(v01, n);
+ v23 = _mm_srli_epi64(v23, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec operator>>(int_t n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ // There is no _mm_srai_epi64. To emulate it, add 0x8000000
+ // before shifting, and subtracted the shifted 0x80000000 after shifting
+#if 0
+ __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
+ _mm_srli_epi64(v01, 63));
+ __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
+ _mm_srli_epi64(v23, 63));
+ v01 = _mm_xor_si128(signmask01, v01);
+ v23 = _mm_xor_si128(signmask23, v23);
+ v01 = _mm_srli_epi64(v01, n);
+ v23 = _mm_srli_epi64(v23, n);
+ v01 = _mm_xor_si128(signmask01, v01);
+ v23 = _mm_xor_si128(signmask23, v23);
+#else
+ // Convert signed to unsiged
+ v01 += _mm_set1_epi64x(U(1) << (bits-1));
+ v23 += _mm_set1_epi64x(U(1) << (bits-1));
+ // Shift
+ v01 = _mm_srli_epi64(v01, n);
+ v23 = _mm_srli_epi64(v23, n);
+ // Undo conversion
+ v01 -= _mm_set1_epi64x(U(1) << (bits-n));
+ v23 -= _mm_set1_epi64x(U(1) << (bits-n));
+#endif
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec operator<<(int_t n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ v01 = _mm_slli_epi64(v01, n);
+ v23 = _mm_slli_epi64(v23, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec& operator>>=(int_t n) { return *this=*this>>n; }
+ intvec& operator<<=(int_t n) { return *this=*this<<n; }
+
+ intvec lsr(intvec n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ __m128i nv01 = _mm256_castsi256_si128(n.v);
+ __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+ v01 = _mm_srl_epi64(v01, nv01);
+ v23 = _mm_srl_epi64(v23, nv23);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec operator>>(intvec n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ __m128i nv01 = _mm256_castsi256_si128(n.v);
+ __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+#if 0
+ // There is no _mm_srai_epi64. To emulate it, invert all bits
+ // before and after shifting if the sign bit is set.
+ __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
+ _mm_srli_epi64(v01, 63));
+ __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0),
+ _mm_srli_epi64(v23, 63));
+ v01 = _mm_xor_si128(signmask01, v01);
+ v23 = _mm_xor_si128(signmask23, v23);
+ v01 = _mm_srl_epi64(v01, nv01);
+ v23 = _mm_srl_epi64(v23, nv23);
+ v01 = _mm_xor_si128(signmask01, v01);
+ v23 = _mm_xor_si128(signmask23, v23);
+#else
+ // Convert signed to unsiged
+ v01 += _mm_set1_epi64x(U(1) << (bits-1));
+ v23 += _mm_set1_epi64x(U(1) << (bits-1));
+ // Shift
+ v01 = _mm_srl_epi64(v01, nv01);
+ v23 = _mm_srl_epi64(v23, nv23);
+ // Undo conversion
+ v01 -= _mm_sll_epi64(_mm_set1_epi64x(1),
+ _mm_sub_epi64(_mm_set1_epi64x(bits), nv01));
+ v23 -= _mm_sll_epi64(_mm_set1_epi64x(1),
+ _mm_sub_epi64(_mm_set1_epi64x(bits), nv23));
+#endif
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec operator<<(intvec n) const
+ {
+ __m128i v01 = _mm256_castsi256_si128(v);
+ __m128i v23 = _mm256_extractf128_si256(v, 1);
+ __m128i nv01 = _mm256_castsi256_si128(n.v);
+ __m128i nv23 = _mm256_extractf128_si256(n.v, 1);
+ v01 = _mm_sll_epi64(v01, nv01);
+ v23 = _mm_sll_epi64(v23, nv23);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(v01), v23, 1);
+ }
+ intvec& operator>>=(intvec n) { return *this=*this>>n; }
+ intvec& operator<<=(intvec n) { return *this=*this<<n; }
+ };
+
+
+
+ template<>
+ struct realvec<double,4>: floatprops<double>
+ {
+ static const int size = 4;
+ typedef real_t scalar_t;
+ typedef __m256d vector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ vector_t v;
+
+ realvec() {}
+ realvec(realvec const& x): v(x.v) {}
+ realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x): v(x) {}
+ realvec(real_t a): v(_mm256_set1_pd(a)) {}
+ realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+
+
+
+ intvec_t as_int() const { return _mm256_castpd_si256(v); }
+ intvec_t convert_int() const
+ {
+#if 0
+ __m128i iv0123 = _mm256_cvtpd_epi32(v);
+ __m128i iv2301 = _mm_shuffle_ps(iv0123, iv0123, 0b10110001);
+ __m256i iv01232301 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(iv0123), iv2301, 1);
+ __m256i zero = _mm256_setzero_ps();
+ return _mm256_unpacklo_ps(iv01232301, zero);
+#else
+ realvec x = _mm256_floor_pd(v);
+ intvec_t ix = x.as_int();
+ boolvec_t sign = ix.as_bool();
+ intvec_t exponent = (ix & exponent_mask) >> mantissa_bits;
+ ix &= mantissa_mask;
+ ix |= U(1) << mantissa_bits; // add hidden bit
+ ix <<= exponent - exponent_offset + 52; // ???
+ ix = ifthen(sign, -ix, ix);
+ return ix;
+#endif
+ }
+
+
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const { return _mm256_sub_pd(_mm256_set1_pd(0.0), v); }
+
+ realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); }
+ realvec operator-(realvec x) const { return _mm256_sub_pd(v, x.v); }
+ realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); }
+ realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); }
+
+ realvec& operator+=(realvec const& x) { return *this=*this+x; }
+ realvec& operator-=(realvec const& x) { return *this=*this-x; }
+ realvec& operator*=(realvec const& x) { return *this=*this*x; }
+ realvec& operator/=(realvec const& x) { return *this=*this/x; }
+
+ real_t prod() const
+ {
+ return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+ }
+ real_t sum() const
+ {
+ return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ }
+
+
+
+ boolvec_t operator==(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
+ }
+ boolvec_t operator!=(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_NEQ_OQ);
+ }
+ boolvec_t operator<(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
+ }
+ boolvec_t operator<=(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
+ }
+ boolvec_t operator>(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
+ }
+ boolvec_t operator>=(realvec const& x) const
+ {
+ return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
+ }
+
+
+
+#if 0
+ realvec copysign(realvec y) const
+ {
+ uint_t signmask = U(1) << (bits-1);
+ intvec_t value = as_int() & IV(~signmask);
+ intvec_t sign = y.as_int() & IV(signmask);
+ return (sign | value).as_float();
+ }
+
+ realvec fabs() const
+ {
+ uint_t signmask = U(1) << (bits-1);
+ return (as_int() & IV(~signmask)).as_float();
+ }
+
+ intvec_t ilogb() const
+ {
+ intvec_t exponent_mask =
+ ((U(1) << exponent_bits) - U(1)) << mantissa_bits;
+ return lsr(as_int() & exponent_mask, mantissa_bits) - IV(exponent_offset);
+ }
+
+ realvec scalbn(intvec_t n) const
+ {
+ return *this * ((n + exponent_offset) << mantissa_bits).as_float();
+ }
+
+ boolvec_t signbit() const
+ {
+ return v;
+ }
+#endif
+
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ realvec scalbn(intvec_t n) const { return MF::vml_scalbn(*this, n); }
+ boolvec_t signbit() const { return v; }
+
+
+
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec floor() const { return _mm256_floor_pd(v); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
+ // realvec rcp() const { return MF::vml_rcp(*this); }
+ realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+ realvec sqrt() const { return _mm256_sqrt_pd(v); }
+ // realvec sqrt() const { return MF::vml_sqrt(*this); }
+ };
+
+
+
+ // boolvec definitions
+
+ inline
+ auto boolvec<double,4>::as_int() const -> intvec_t
+ {
+ return _mm256_castpd_si256(v);
+ }
+
+ inline
+ auto boolvec<double,4>::convert_int() const -> intvec_t
+ {
+ //return ifthen(v, U(1), U(0));
+ return lsr(as_int(), bits-1);
+ }
+
+ inline
+ auto boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+ {
+ return ifthen(x.as_float(), y.as_float()).as_int();
+ }
+
+ inline
+ auto boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+ {
+ return _mm256_blendv_pd(y.v, x.v, v);
+ }
+
+
+
+ // intvec definitions
+
+ inline auto intvec<double,4>::as_float() const -> realvec_t
+ {
+ return _mm256_castsi256_pd(v);
+ }
+
+ inline auto intvec<double,4>::convert_float() const -> realvec_t
+ {
+ intvec x = v;
+ uint_t signbitmask = U(1) << (bits-1);
+ // make unsigned
+ x += signbitmask;
+ // convert lower 52 bits
+ intvec xlo = x & IV((U(1) << mantissa_bits) - 1);
+ int_t exponent_0 = exponent_offset + mantissa_bits;
+ xlo = xlo | exponent_0;
+ realvec_t flo = xlo.as_float() - FP::as_float(exponent_0);
+ // convert upper 22 bits
+ intvec xhi = x.lsr(U(mantissa_bits));
+ int_t exponent_52 = exponent_0 + mantissa_bits;
+ xhi = xhi | exponent_52;
+ realvec_t fhi = xhi.as_float() - FP::as_float(exponent_52);
+ return flo + fhi - R(signbitmask);
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_DOUBLE_AVX_H
diff --git a/vec_float.h b/vec_float.h
new file mode 100644
index 0000000..6025ab7
--- /dev/null
+++ b/vec_float.h
@@ -0,0 +1,305 @@
+// -*-C++-*-
+
+#ifndef VEC_FLOAT_H
+#define VEC_FLOAT_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+
+
+namespace vecmathlib {
+
+ template<> struct boolvec<float,1>;
+ template<> struct intvec<float,1>;
+ template<> struct realvec<float,1>;
+
+
+
+ template<>
+ struct boolvec<float,1>: floatprops<float>
+ {
+ static const int size = 1;
+ typedef bool scalar_t;
+ typedef bool bvector_t;
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ bvector_t v;
+
+ boolvec() {}
+ boolvec(boolvec const& x): v(x.v) {}
+ boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ //boolvec(vector_t x): v(x) {}
+ boolvec(bool a): v(a) {}
+ boolvec(bool const* as): v(as[0]) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return v; }
+ boolvec& set_elt(int n, bool a) { return v=a, *this; }
+
+
+
+ auto as_int() const -> intvec_t; // defined after intvec
+ auto convert_int() const -> intvec_t; // defined after intvec
+
+
+
+ boolvec operator!() const { return !v; }
+
+ boolvec operator&&(boolvec x) const { return v&&x.v; }
+ boolvec operator||(boolvec x) const { return v||x.v; }
+ boolvec operator==(boolvec x) const { return v==x.v; }
+ boolvec operator!=(boolvec x) const { return v!=x.v; }
+
+ bool all() const { return v; }
+ bool any() const { return v; }
+
+
+
+ // ifthen(condition, then-value, else-value)
+ auto ifthen(intvec_t x,
+ intvec_t y) const -> intvec_t; // defined after intvec
+ auto ifthen(realvec_t x,
+ realvec_t y) const -> realvec_t; // defined after realvec
+
+ };
+
+
+
+ template<>
+ struct intvec<float,1>: floatprops<float>
+ {
+ static const int size = 1;
+ typedef int_t scalar_t;
+ typedef int_t ivector_t;
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ ivector_t v;
+
+ intvec() {}
+ intvec(intvec const& x): v(x.v) {}
+ intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ //intvec(vector_t x): v(x) {}
+ intvec(int_t a): v(a) {}
+ intvec(int_t const* as): v(as[0]) {}
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return v; }
+ intvec& set_elt(int n, int_t a) { return v=a, *this; }
+
+
+
+ auto as_bool() const -> boolvec_t { return *(bool const*)&v; }
+ auto convert_bool() const -> boolvec_t { return v; }
+ auto as_float() const -> realvec_t; // defined after realvec
+ auto convert_float() const -> realvec_t; // defined after realvec
+
+
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return +v; }
+ intvec operator-() const { return -v; }
+
+ intvec operator+(intvec x) const { return v+x.v; }
+ intvec operator-(intvec x) const { return v-x.v; }
+
+ intvec& operator+=(intvec const& x) { return *this=*this+x; }
+ intvec& operator-=(intvec const& x) { return *this=*this-x; }
+
+
+
+ intvec operator~() const { return ~v; }
+
+ intvec operator&(intvec x) const { return v&x.v; }
+ intvec operator|(intvec x) const { return v|x.v; }
+ intvec operator^(intvec x) const { return v^x.v; }
+
+ intvec& operator&=(intvec const& x) { return *this=*this&x; }
+ intvec& operator|=(intvec const& x) { return *this=*this|x; }
+ intvec& operator^=(intvec const& x) { return *this=*this^x; }
+
+
+
+ intvec lsr(int_t n) const { return U(v)>>n; }
+ intvec operator>>(int_t n) const { return v>>n; }
+ intvec operator<<(int_t n) const { return v<<n; }
+ intvec& operator>>=(int_t n) { return *this=*this>>n; }
+ intvec& operator<<=(int_t n) { return *this=*this<<n; }
+
+ intvec lsr(intvec n) const { return U(v)>>n; }
+ intvec operator>>(intvec n) const { return v>>n; }
+ intvec operator<<(intvec n) const { return v<<n; }
+ intvec& operator>>=(intvec n) { return *this=*this>>n; }
+ intvec& operator<<=(intvec n) { return *this=*this<<n; }
+ };
+
+
+
+ template<>
+ struct realvec<float,1>: floatprops<float>
+ {
+ static int const size = 1;
+ typedef real_t scalar_t;
+ typedef real_t vector_t;
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ vector_t v;
+
+ realvec() {}
+ realvec(realvec const& x): v(x.v) {}
+ realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ //realvec(vector_t x): v(x) {}
+ realvec(real_t a): v(a) {}
+ realvec(real_t const* as): v(as[0]) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return v; }
+ realvec& set_elt(int n, real_t a) { return v=a, *this; }
+
+
+
+ intvec_t as_int() const { return FP::as_int(v); }
+ intvec_t convert_int() const { return MF::vml_convert_int(v); }
+
+
+
+ realvec operator+() const { return +v; }
+ realvec operator-() const { return -v; }
+
+ realvec operator+(realvec x) const { return v+x.v; }
+ realvec operator-(realvec x) const { return v-x.v; }
+ realvec operator*(realvec x) const { return v*x.v; }
+ realvec operator/(realvec x) const { return v/x.v; }
+
+ realvec& operator+=(realvec const& x) { return *this=*this+x; }
+ realvec& operator-=(realvec const& x) { return *this=*this-x; }
+ realvec& operator*=(realvec const& x) { return *this=*this*x; }
+ realvec& operator/=(realvec const& x) { return *this=*this/x; }
+
+ real_t prod() const { return v; }
+ real_t sum() const { return v; }
+
+
+
+ boolvec_t operator==(realvec const& x) const { return v==x.v; }
+ boolvec_t operator!=(realvec const& x) const { return v!=x.v; }
+ boolvec_t operator<(realvec const& x) const { return v<x.v; }
+ boolvec_t operator<=(realvec const& x) const { return v<=x.v; }
+ boolvec_t operator>(realvec const& x) const { return v>x.v; }
+ boolvec_t operator>=(realvec const& x) const { return v>=x.v; }
+
+
+
+ realvec copysign(realvec x) const { return MF::vml_copysign(v, x.v); }
+ realvec fabs() const { return MF::vml_fabs(v); }
+ intvec_t ilogb() const { return MF::vml_ilogb(v); }
+ realvec scalbn(intvec_t n) const { return MF::vml_scalbn(v, n.v); }
+ boolvec_t signbit() const { return MF::vml_signbit(v); }
+
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec floor() const { return std::floor(v); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec rcp() const { return MF::vml_rcp(*this); }
+ realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+ realvec sqrt() const { return MF::vml_sqrt(*this); }
+ };
+
+
+
+ // boolvec definitions
+
+ inline
+ auto boolvec<float,1>::as_int() const -> intvec_t
+ {
+ return v;
+ }
+
+ inline
+ auto boolvec<float,1>::convert_int() const -> intvec_t
+ {
+ return v;
+ }
+
+ inline
+ auto boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+ {
+ return v ? x.v : y.v;
+ }
+
+ inline
+ auto boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+ {
+ return v ? x.v : y.v;
+ }
+
+
+
+ // intvec definitions
+
+ inline auto intvec<float,1>::as_float() const -> realvec_t
+ {
+ return FP::as_float(v);
+ }
+
+ inline auto intvec<float,1>::convert_float() const -> realvec_t
+ {
+ return MF::vml_convert_float(v);
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_FLOAT_H
diff --git a/vecmathlib.h b/vecmathlib.h
new file mode 100644
index 0000000..2264545
--- /dev/null
+++ b/vecmathlib.h
@@ -0,0 +1,11 @@
+// -*-C++-*-
+
+#ifndef VECMATHLIB_H
+#define VECMATHLIB_H
+
+#include "vec_float.h"
+#if defined __AVX__ // Intel AVX
+# include "vec_double_avx.h"
+#endif
+
+#endif // #ifndef VECMATHLIB_H
OpenPOWER on IntegriCloud