summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--bench.cc274
-rw-r--r--example.cc14
-rw-r--r--example_float.cc14
-rw-r--r--floatbuiltins.h464
-rw-r--r--floatprops.h571
-rw-r--r--floattypes.h316
-rw-r--r--instantiations.cc239
-rw-r--r--interp.cc26
-rw-r--r--loop.cc250
-rw-r--r--mathfuncs.h2
-rw-r--r--mathfuncs_asin.h363
-rw-r--r--mathfuncs_asinh.h53
-rw-r--r--mathfuncs_base.h243
-rw-r--r--mathfuncs_convert.h358
-rw-r--r--mathfuncs_exp.h237
-rw-r--r--mathfuncs_fabs.h305
-rw-r--r--mathfuncs_int.h243
-rw-r--r--mathfuncs_log.h137
-rw-r--r--mathfuncs_pow.h43
-rw-r--r--mathfuncs_rcp.h117
-rw-r--r--mathfuncs_sin.h419
-rw-r--r--mathfuncs_sinh.h37
-rw-r--r--mathfuncs_sqrt.h124
-rw-r--r--selftest.cc1741
-rw-r--r--vec_altivec_float4.h1139
-rw-r--r--vec_avx_double4.h1359
-rw-r--r--vec_avx_float8.h1395
-rw-r--r--vec_avx_fp16_16.h1171
-rw-r--r--vec_avx_fp8_32.h1249
-rw-r--r--vec_base.h1179
-rw-r--r--vec_builtin.h2619
-rw-r--r--vec_mask.h121
-rw-r--r--vec_mic_double8.h1236
-rw-r--r--vec_neon_float2.h1085
-rw-r--r--vec_neon_float4.h1131
-rw-r--r--vec_pseudo.h3052
-rw-r--r--vec_qpx_double4.h1399
-rw-r--r--vec_sse_double1.h1002
-rw-r--r--vec_sse_double2.h1237
-rw-r--r--vec_sse_float1.h998
-rw-r--r--vec_sse_float4.h1286
-rw-r--r--vec_test.h2690
-rw-r--r--vec_vsx_double2.h1215
-rw-r--r--vecmathlib.h248
44 files changed, 15431 insertions, 17970 deletions
diff --git a/bench.cc b/bench.cc
index e795985..ac3eb46 100644
--- a/bench.cc
+++ b/bench.cc
@@ -16,47 +16,38 @@
using namespace std;
using namespace vecmathlib;
-
-
#ifndef __has_builtin
-# define __has_builtin(x) 0 // Compatibility with non-clang compilers
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers
#endif
-
-
typedef unsigned long long ticks;
-inline ticks getticks()
-{
+inline ticks getticks() {
#if __has_builtin(__builtin_readcyclecounter)
return __builtin_readcyclecounter();
#elif defined __x86_64__
ticks a, d;
- asm volatile("rdtsc" : "=a" (a), "=d" (d));
+ asm volatile("rdtsc" : "=a"(a), "=d"(d));
return a | (d << 32);
#elif defined __powerpc__
unsigned int tbl, tbu, tbu1;
do {
- asm volatile("mftbu %0": "=r"(tbu));
- asm volatile("mftb %0": "=r"(tbl));
- asm volatile("mftbu %0": "=r"(tbu1));
+ asm volatile("mftbu %0" : "=r"(tbu));
+ asm volatile("mftb %0" : "=r"(tbl));
+ asm volatile("mftbu %0" : "=r"(tbu1));
} while (tbu != tbu1);
return ((unsigned long long)tbu << 32) | tbl;
#else
timeval tv;
gettimeofday(&tv, NULL);
return 1000000ULL * tv.tv_sec + tv.tv_usec;
- // timespec ts;
- // clock_gettime(CLOCK_REALTIME, &ts);
- // return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+// timespec ts;
+// clock_gettime(CLOCK_REALTIME, &ts);
+// return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
#endif
}
-inline double elapsed(ticks t1, ticks t0)
-{
- return t1-t0;
-}
+inline double elapsed(ticks t1, ticks t0) { return t1 - t0; }
-double get_sys_time()
-{
+double get_sys_time() {
timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + 1.0e-6 * tv.tv_usec;
@@ -65,8 +56,7 @@ double get_sys_time()
// return ts.tv_sec + 1.0e-9 * ts.tv_nsec;
}
-double measure_tick()
-{
+double measure_tick() {
ticks const rstart = getticks();
double const wstart = get_sys_time();
while (get_sys_time() - wstart < 0.1) {
@@ -74,124 +64,103 @@ double measure_tick()
}
ticks const rend = getticks();
double const wend = get_sys_time();
- assert(wend-wstart >= 0.09);
+ assert(wend - wstart >= 0.09);
return (wend - wstart) / elapsed(rend, rstart);
}
-
-
double global_result = 0.0;
-template<typename realvec_t>
-void save_result(realvec_t result)
-{
- for (int i=0; i<realvec_t::size; ++i) {
+template <typename realvec_t> void save_result(realvec_t result) {
+ for (int i = 0; i < realvec_t::size; ++i) {
global_result += result[i];
}
// Check global accumulator to prevent optimisation
- if (! vml_std::isfinite(global_result)) {
+ if (!vml_std::isfinite(global_result)) {
cout << "\n"
<< "WARNING: Global accumulator is not finite\n";
}
}
+template <typename T> inline T nop(T x) { return x; }
+template <typename T> inline T fneg(T x) { return -x; }
-template<typename T> inline T nop(T x) { return x; }
-
-template<typename T> inline T fneg(T x) { return -x; }
+template <typename T> inline T fadd(T x, T y) { return x + y; }
+template <typename T> inline T fsub(T x, T y) { return x - y; }
+template <typename T> inline T fmul(T x, T y) { return x * y; }
+template <typename T> inline T fdiv(T x, T y) { return x / y; }
-template<typename T> inline T fadd(T x, T y) { return x+y; }
-template<typename T> inline T fsub(T x, T y) { return x-y; }
-template<typename T> inline T fmul(T x, T y) { return x*y; }
-template<typename T> inline T fdiv(T x, T y) { return x/y; }
-
-template<typename T> inline T frexp0(T x)
-{
+template <typename T> inline T frexp0(T x) {
typename T::intvec_t ir;
return frexp(x, &ir);
}
-template<typename T> inline typename T::intvec_t frexp1(T x)
-{
+template <typename T> inline typename T::intvec_t frexp1(T x) {
typename T::intvec_t ir;
frexp(x, &ir);
return ir;
}
-template<typename T> inline T ldexps(T x, T y)
-{
+template <typename T> inline T ldexps(T x, T y) {
typename T::intvec_t iy = convert_int(y);
return ldexp(x, iy[0]);
}
-template<typename T> inline T ldexpv(T x, T y)
-{
+template <typename T> inline T ldexpv(T x, T y) {
typename T::intvec_t iy = convert_int(y);
return ldexp(x, iy);
}
-
-
-#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX) \
- template<typename T> \
- struct functor_##FUNC { \
- static typename T::real_t get_xmin() { return XMIN; } \
- static typename T::real_t get_xmax() { return XMAX; } \
- static const char* name() { return #FUNC; } \
- T operator()(T x) { \
- return FUNC(x); \
- } \
+#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX) \
+ template <typename T> struct functor_##FUNC { \
+ static typename T::real_t get_xmin() { return XMIN; } \
+ static typename T::real_t get_xmax() { return XMAX; } \
+ static const char *name() { return #FUNC; } \
+ T operator()(T x) { return FUNC(x); } \
}
-#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX) \
- template<typename T> \
- struct functor_##FUNC { \
- static typename T::real_t get_xmin() { return XMIN; } \
- static typename T::real_t get_xmax() { return XMAX; } \
- static const char* name() { return #FUNC; } \
- T operator()(T x) { \
- typename T::boolvec_t res = FUNC(x); \
- return convert_float(convert_int(res)); \
- } \
+#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX) \
+ template <typename T> struct functor_##FUNC { \
+ static typename T::real_t get_xmin() { return XMIN; } \
+ static typename T::real_t get_xmax() { return XMAX; } \
+ static const char *name() { return #FUNC; } \
+ T operator()(T x) { \
+ typename T::boolvec_t res = FUNC(x); \
+ return convert_float(convert_int(res)); \
+ } \
}
-#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX) \
- template<typename T> \
- struct functor_##FUNC { \
- static typename T::real_t get_xmin() { return XMIN; } \
- static typename T::real_t get_xmax() { return XMAX; } \
- static const char* name() { return #FUNC; } \
- T operator()(T x) { \
- typename T::intvec_t res = FUNC(x); \
- return convert_float(res); \
- } \
+#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX) \
+ template <typename T> struct functor_##FUNC { \
+ static typename T::real_t get_xmin() { return XMIN; } \
+ static typename T::real_t get_xmax() { return XMAX; } \
+ static const char *name() { return #FUNC; } \
+ T operator()(T x) { \
+ typename T::intvec_t res = FUNC(x); \
+ return convert_float(res); \
+ } \
}
-#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET) \
- template<typename T> \
- struct functor_##FUNC { \
- static typename T::real_t get_xmin() { return XMIN; } \
- static typename T::real_t get_xmax() { return XMAX; } \
- static const char* name() { return #FUNC; } \
- T operator()(T x) { \
- const typename T::real_t yoffset = YOFFSET; \
- return FUNC(x, x + T(yoffset)); \
- } \
+#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET) \
+ template <typename T> struct functor_##FUNC { \
+ static typename T::real_t get_xmin() { return XMIN; } \
+ static typename T::real_t get_xmax() { return XMAX; } \
+ static const char *name() { return #FUNC; } \
+ T operator()(T x) { \
+ const typename T::real_t yoffset = YOFFSET; \
+ return FUNC(x, x + T(yoffset)); \
+ } \
}
-#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET) \
- template<typename T> \
- struct functor_##FUNC { \
- static typename T::real_t get_xmin() { return XMIN; } \
- static typename T::real_t get_xmax() { return XMAX; } \
- static const char* name() { return #FUNC; } \
- T operator()(T x) { \
- const typename T::real_t yoffset = YOFFSET; \
- const typename T::real_t zoffset = ZOFFSET; \
- return FUNC(x, x + T(yoffset), x + T(zoffset)); \
- } \
+#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET) \
+ template <typename T> struct functor_##FUNC { \
+ static typename T::real_t get_xmin() { return XMIN; } \
+ static typename T::real_t get_xmax() { return XMAX; } \
+ static const char *name() { return #FUNC; } \
+ T operator()(T x) { \
+ const typename T::real_t yoffset = YOFFSET; \
+ const typename T::real_t zoffset = ZOFFSET; \
+ return FUNC(x, x + T(yoffset), x + T(zoffset)); \
+ } \
}
-
-
DECLARE_FUNCTOR(nop, 0.0, 1.0);
DECLARE_FUNCTOR(fneg, 0.0, 1.0);
@@ -252,137 +221,127 @@ DECLARE_FUNCTOR(tan, 0.0, 1.0);
DECLARE_FUNCTOR(tanh, -1.0, +1.0);
DECLARE_FUNCTOR(trunc, -1.0, +1.0);
-
-
-template<typename realvec_t, template<typename> class func_t>
-double run_bench()
-{
+template <typename realvec_t, template <typename> class func_t>
+double run_bench() {
const int numiters = 1000000;
-
+
typedef typename realvec_t::real_t real_t;
const real_t xmin = func_t<realvec_t>::get_xmin();
const real_t xmax = func_t<realvec_t>::get_xmax();
realvec_t x0, dx;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
x0.set_elt(i, xmin + (xmax - xmin) / numiters * i / realvec_t::size);
dx.set_elt(i, (xmax - xmin) / numiters);
}
realvec_t x, y;
ticks t0, t1;
double const cycles_per_tick = 1.0; // measure_tick();
-
+
func_t<realvec_t> func;
t0 = getticks();
x = y = x0;
- for (int n=0; n<numiters; ++n) {
+ for (int n = 0; n < numiters; ++n) {
y += func(x);
x += dx;
}
t1 = getticks();
save_result(y);
-
- return cycles_per_tick * elapsed(t1,t0) * realvec_t::size / numiters;
+
+ return cycles_per_tick * elapsed(t1, t0) * realvec_t::size / numiters;
}
-template<typename realvec_t, template<typename> class func_t>
-void bench_type_func()
-{
- cout << " "
- << setw(-5) << func_t<realvec_t>::name() << " "
- << setw(18) << realvec_t::name() << ": " << flush;
+template <typename realvec_t, template <typename> class func_t>
+void bench_type_func() {
+ cout << " " << setw(-5) << func_t<realvec_t>::name() << " " << setw(18)
+ << realvec_t::name() << ": " << flush;
double const cycles = run_bench<realvec_t, func_t>();
cout << cycles << " cycles\n" << flush;
}
-template<template<typename> class func_t>
-void bench_func()
-{
+template <template <typename> class func_t> void bench_func() {
cout << "\n"
<< "Benchmarking " << func_t<float32_vec>().name() << ":\n";
-
+
// Note: We benchmark neither testvec (since this is known to be
// slow), nor builtinvec (since this has about the same performance
// as pseudovec, and is also not very efficient).
-
- bench_type_func<realpseudovec<float,1>, func_t>();
+
+ bench_type_func<realpseudovec<float, 1>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<float,1>, func_t>();
+ bench_type_func<realbuiltinvec<float, 1>, func_t>();
#endif
- bench_type_func<realtestvec<float,1>, func_t>();
+ bench_type_func<realtestvec<float, 1>, func_t>();
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
- bench_type_func<realvec<float,1>, func_t>();
+ bench_type_func<realvec<float, 1>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_2
- bench_type_func<realpseudovec<float,2>, func_t>();
+ bench_type_func<realpseudovec<float, 2>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<float,2>, func_t>();
+ bench_type_func<realbuiltinvec<float, 2>, func_t>();
#endif
// bench_type_func<realtestvec<float,2>, func_t>();
- bench_type_func<realvec<float,2>, func_t>();
+ bench_type_func<realvec<float, 2>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_4
- bench_type_func<realpseudovec<float,4>, func_t>();
+ bench_type_func<realpseudovec<float, 4>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<float,4>, func_t>();
+ bench_type_func<realbuiltinvec<float, 4>, func_t>();
#endif
// bench_type_func<realtestvec<float,4>, func_t>();
- bench_type_func<realvec<float,4>, func_t>();
+ bench_type_func<realvec<float, 4>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
- bench_type_func<realpseudovec<float,8>, func_t>();
+ bench_type_func<realpseudovec<float, 8>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<float,8>, func_t>();
+ bench_type_func<realbuiltinvec<float, 8>, func_t>();
#endif
// bench_type_func<realtestvec<float,8>, func_t>();
- bench_type_func<realvec<float,8>, func_t>();
+ bench_type_func<realvec<float, 8>, func_t>();
#endif
-
- bench_type_func<realpseudovec<double,1>, func_t>();
+
+ bench_type_func<realpseudovec<double, 1>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<double,1>, func_t>();
+ bench_type_func<realbuiltinvec<double, 1>, func_t>();
#endif
- bench_type_func<realtestvec<double,1>, func_t>();
+ bench_type_func<realtestvec<double, 1>, func_t>();
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
- bench_type_func<realvec<double,1>, func_t>();
+ bench_type_func<realvec<double, 1>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
- bench_type_func<realpseudovec<double,2>, func_t>();
+ bench_type_func<realpseudovec<double, 2>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<double,2>, func_t>();
+ bench_type_func<realbuiltinvec<double, 2>, func_t>();
#endif
// bench_type_func<realtestvec<double,2>, func_t>();
- bench_type_func<realvec<double,2>, func_t>();
+ bench_type_func<realvec<double, 2>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
- bench_type_func<realpseudovec<double,4>, func_t>();
+ bench_type_func<realpseudovec<double, 4>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<double,4>, func_t>();
+ bench_type_func<realbuiltinvec<double, 4>, func_t>();
#endif
// bench_type_func<realtestvec<double,4>, func_t>();
- bench_type_func<realvec<double,4>, func_t>();
+ bench_type_func<realvec<double, 4>, func_t>();
#endif
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8
- bench_type_func<realpseudovec<double,8>, func_t>();
+ bench_type_func<realpseudovec<double, 8>, func_t>();
#ifdef __clang__
- bench_type_func<realbuiltinvec<double,8>, func_t>();
+ bench_type_func<realbuiltinvec<double, 8>, func_t>();
#endif
// bench_type_func<realtestvec<double,8>, func_t>();
- bench_type_func<realvec<double,8>, func_t>();
+ bench_type_func<realvec<double, 8>, func_t>();
#endif
}
-
-
-void bench()
-{
+void bench() {
bench_func<functor_nop>();
-
+
bench_func<functor_fneg>();
bench_func<functor_fadd>();
bench_func<functor_fsub>();
bench_func<functor_fmul>();
bench_func<functor_fdiv>();
-
+
bench_func<functor_acos>();
bench_func<functor_acosh>();
bench_func<functor_asin>();
@@ -436,10 +395,7 @@ void bench()
bench_func<functor_trunc>();
}
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
cout << "Benchmarking math functions:\n";
bench();
return 0;
diff --git a/example.cc b/example.cc
index c48ef67..427ec02 100644
--- a/example.cc
+++ b/example.cc
@@ -7,20 +7,18 @@
using namespace std;
using namespace vecmathlib;
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
// Declare a double precision vector with an architecture-dependent
// number of elements
float64_vec x;
// Set each element separately. This is inefficient and should be
// avoided if possible, but we want to demonstrate it here anyway.
- for (int i=0; i<float64_vec::size; ++i) x.set_elt(i, double(i));
+ for (int i = 0; i < float64_vec::size; ++i)
+ x.set_elt(i, double(i));
float64_vec y = x + float64_vec(1.0);
y = sqrt(y);
float64_vec z = log(y);
-
+
// Boolean vectors are closely related to either double or float
// vectors, thus we need to make a distinction
bool64_vec b = x < y;
@@ -29,12 +27,12 @@ int main(int argc, char** argv)
// corresponding to "float64_vec", and there is "int_vec"
// correpsonding to "float_vec".
int64_vec i = convert_int(y);
-
+
cout << "x=" << x << "\n";
cout << "y=" << y << "\n";
cout << "z=" << z << "\n";
cout << "b=" << b << "\n";
cout << "i=" << i << "\n";
-
+
return 0;
}
diff --git a/example_float.cc b/example_float.cc
index fed91c7..4feea0e 100644
--- a/example_float.cc
+++ b/example_float.cc
@@ -7,20 +7,18 @@
using namespace std;
using namespace vecmathlib;
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
// Declare a float precision vector with an architecture-dependent
// number of elements
float32_vec x;
// Set each element separately. This is inefficient and should be
// avoided if possible, but we want to demonstrate it here anyway.
- for (int i=0; i<float32_vec::size; ++i) x.set_elt(i, float(i));
+ for (int i = 0; i < float32_vec::size; ++i)
+ x.set_elt(i, float(i));
float32_vec y = x + float32_vec(1.0);
y = sqrt(y);
float32_vec z = log(y);
-
+
// Boolean vectors are closely related to either float or float
// vectors, thus we need to make a distinction
bool32_vec b = x < y;
@@ -29,12 +27,12 @@ int main(int argc, char** argv)
// corresponding to "float32_vec", and there is "int_vec"
// correpsonding to "float32_vec".
int32_vec i = convert_int(y);
-
+
cout << "x=" << x << "\n";
cout << "y=" << y << "\n";
cout << "z=" << z << "\n";
cout << "b=" << b << "\n";
cout << "i=" << i << "\n";
-
+
return 0;
}
diff --git a/floatbuiltins.h b/floatbuiltins.h
index ee076a2..a7dd6f1 100644
--- a/floatbuiltins.h
+++ b/floatbuiltins.h
@@ -6,323 +6,383 @@
#if defined __clang__
namespace vecmathlib {
-
- inline char builtin_abs(char x) { return __builtin_abs(x); }
- inline short builtin_abs(short x) { return __builtin_abs(x); }
- inline int builtin_abs(int x) { return __builtin_abs(x); }
- inline long builtin_abs(long x) { return __builtin_labs(x); }
+
+inline char builtin_abs(char x) { return __builtin_abs(x); }
+inline short builtin_abs(short x) { return __builtin_abs(x); }
+inline int builtin_abs(int x) { return __builtin_abs(x); }
+inline long builtin_abs(long x) { return __builtin_labs(x); }
#if __SIZEOF_LONG_LONG__
- inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
+inline long long builtin_abs(long long x) { return __builtin_llabs(x); }
#endif
-
- inline unsigned char builtin_clz(unsigned char x) { return __builtin_clzs(x) - CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char)); }
- inline unsigned short builtin_clz(unsigned short x) { return __builtin_clzs(x); }
- inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
- inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
+
+inline unsigned char builtin_clz(unsigned char x) {
+ return __builtin_clzs(x) -
+ CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char));
+}
+inline unsigned short builtin_clz(unsigned short x) {
+ return __builtin_clzs(x);
+}
+inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); }
+inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); }
#if __SIZEOF_LONG_LONG__
- inline unsigned long long builtin_clz(unsigned long long x) { return __builtin_clzll(x); }
+inline unsigned long long builtin_clz(unsigned long long x) {
+ return __builtin_clzll(x);
+}
#endif
-
- inline unsigned char builtin_popcount(unsigned char x) { return __builtin_popcount(x); }
- inline unsigned short builtin_popcount(unsigned short x) { return __builtin_popcount(x); }
- inline unsigned int builtin_popcount(unsigned int x) { return __builtin_popcount(x); }
- inline unsigned long builtin_popcount(unsigned long x) { return __builtin_popcountl(x); }
+
+inline unsigned char builtin_popcount(unsigned char x) {
+ return __builtin_popcount(x);
+}
+inline unsigned short builtin_popcount(unsigned short x) {
+ return __builtin_popcount(x);
+}
+inline unsigned int builtin_popcount(unsigned int x) {
+ return __builtin_popcount(x);
+}
+inline unsigned long builtin_popcount(unsigned long x) {
+ return __builtin_popcountl(x);
+}
#if __SIZEOF_LONG_LONG__
- inline unsigned long long builtin_popcount(unsigned long long x) { return __builtin_popcountll(x); }
+inline unsigned long long builtin_popcount(unsigned long long x) {
+ return __builtin_popcountll(x);
+}
#endif
-
-
-
- inline float builtin_acos(float x) { return __builtin_acosf(x); }
- inline double builtin_acos(double x) { return __builtin_acos(x); }
+
+inline float builtin_acos(float x) { return __builtin_acosf(x); }
+inline double builtin_acos(double x) { return __builtin_acos(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
+inline long double builtin_acos(long double x) { return __builtin_acosl(x); }
#endif
-
- inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
- inline double builtin_acosh(double x) { return __builtin_acosh(x); }
+
+inline float builtin_acosh(float x) { return __builtin_acoshf(x); }
+inline double builtin_acosh(double x) { return __builtin_acosh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
+inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); }
#endif
-
- inline float builtin_asin(float x) { return __builtin_asinf(x); }
- inline double builtin_asin(double x) { return __builtin_asin(x); }
+
+inline float builtin_asin(float x) { return __builtin_asinf(x); }
+inline double builtin_asin(double x) { return __builtin_asin(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
+inline long double builtin_asin(long double x) { return __builtin_asinl(x); }
#endif
-
- inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
- inline double builtin_asinh(double x) { return __builtin_asinh(x); }
+
+inline float builtin_asinh(float x) { return __builtin_asinhf(x); }
+inline double builtin_asinh(double x) { return __builtin_asinh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
+inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); }
#endif
-
- inline float builtin_atan(float x) { return __builtin_atanf(x); }
- inline double builtin_atan(double x) { return __builtin_atan(x); }
+
+inline float builtin_atan(float x) { return __builtin_atanf(x); }
+inline double builtin_atan(double x) { return __builtin_atan(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
+inline long double builtin_atan(long double x) { return __builtin_atanl(x); }
#endif
-
- inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
- inline double builtin_atan2(double x, double y) { return __builtin_atan2(x, y); }
+
+inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); }
+inline double builtin_atan2(double x, double y) {
+ return __builtin_atan2(x, y);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_atan2(long double x, long double y) { return __builtin_atan2l(x, y); }
+inline long double builtin_atan2(long double x, long double y) {
+ return __builtin_atan2l(x, y);
+}
#endif
-
- inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
- inline double builtin_atanh(double x) { return __builtin_atanh(x); }
+
+inline float builtin_atanh(float x) { return __builtin_atanhf(x); }
+inline double builtin_atanh(double x) { return __builtin_atanh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
+inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); }
#endif
-
- inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
- inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
+
+inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); }
+inline double builtin_cbrt(double x) { return __builtin_cbrt(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
+inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); }
#endif
-
- inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
- inline double builtin_ceil(double x) { return __builtin_ceil(x); }
+
+inline float builtin_ceil(float x) { return __builtin_ceilf(x); }
+inline double builtin_ceil(double x) { return __builtin_ceil(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
+inline long double builtin_ceil(long double x) { return __builtin_ceill(x); }
#endif
-
- inline float builtin_copysign(float x, float y) { return __builtin_copysignf(x, y); }
- inline double builtin_copysign(double x, double y) { return __builtin_copysign(x, y); }
+
+inline float builtin_copysign(float x, float y) {
+ return __builtin_copysignf(x, y);
+}
+inline double builtin_copysign(double x, double y) {
+ return __builtin_copysign(x, y);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_copysign(long double x, long double y) { return __builtin_copysignl(x, y); }
+inline long double builtin_copysign(long double x, long double y) {
+ return __builtin_copysignl(x, y);
+}
#endif
- inline float builtin_cos(float x) { return __builtin_cosf(x); }
- inline double builtin_cos(double x) { return __builtin_cos(x); }
+inline float builtin_cos(float x) { return __builtin_cosf(x); }
+inline double builtin_cos(double x) { return __builtin_cos(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
+inline long double builtin_cos(long double x) { return __builtin_cosl(x); }
#endif
-
- inline float builtin_cosh(float x) { return __builtin_coshf(x); }
- inline double builtin_cosh(double x) { return __builtin_cosh(x); }
+
+inline float builtin_cosh(float x) { return __builtin_coshf(x); }
+inline double builtin_cosh(double x) { return __builtin_cosh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
+inline long double builtin_cosh(long double x) { return __builtin_coshl(x); }
#endif
- inline float builtin_exp(float x) { return __builtin_expf(x); }
- inline double builtin_exp(double x) { return __builtin_exp(x); }
+inline float builtin_exp(float x) { return __builtin_expf(x); }
+inline double builtin_exp(double x) { return __builtin_exp(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_exp(long double x) { return __builtin_expl(x); }
+inline long double builtin_exp(long double x) { return __builtin_expl(x); }
#endif
-
- inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
- inline double builtin_exp2(double x) { return __builtin_exp2(x); }
+
+inline float builtin_exp2(float x) { return __builtin_exp2f(x); }
+inline double builtin_exp2(double x) { return __builtin_exp2(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
+inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); }
#endif
- inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
- inline double builtin_expm1(double x) { return __builtin_expm1(x); }
+inline float builtin_expm1(float x) { return __builtin_expm1f(x); }
+inline double builtin_expm1(double x) { return __builtin_expm1(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
+inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); }
#endif
- inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
- inline double builtin_fabs(double x) { return __builtin_fabs(x); }
+inline float builtin_fabs(float x) { return __builtin_fabsf(x); }
+inline double builtin_fabs(double x) { return __builtin_fabs(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
+inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); }
#endif
-
- inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
- inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
+
+inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); }
+inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fdim(long double x, long double y) { return __builtin_fdiml(x, y); }
+inline long double builtin_fdim(long double x, long double y) {
+ return __builtin_fdiml(x, y);
+}
#endif
-
- inline float builtin_floor(float x) { return __builtin_floorf(x); }
- inline double builtin_floor(double x) { return __builtin_floor(x); }
+
+inline float builtin_floor(float x) { return __builtin_floorf(x); }
+inline double builtin_floor(double x) { return __builtin_floor(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
+inline long double builtin_floor(long double x) { return __builtin_floorl(x); }
#endif
-
- inline float builtin_fma(float x, float y, float z) { return __builtin_fmaf(x, y, z); }
- inline double builtin_fma(double x, double y, double z) { return __builtin_fma(x, y, z); }
+
+inline float builtin_fma(float x, float y, float z) {
+ return __builtin_fmaf(x, y, z);
+}
+inline double builtin_fma(double x, double y, double z) {
+ return __builtin_fma(x, y, z);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fma(long double x, long double y, long double z) { return __builtin_fmal(x, y, z); }
+inline long double builtin_fma(long double x, long double y, long double z) {
+ return __builtin_fmal(x, y, z);
+}
#endif
-
- inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
- inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
+
+inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); }
+inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fmax(long double x, long double y) { return __builtin_fmaxl(x, y); }
+inline long double builtin_fmax(long double x, long double y) {
+ return __builtin_fmaxl(x, y);
+}
#endif
-
- inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
- inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
+
+inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); }
+inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fmin(long double x, long double y) { return __builtin_fminl(x, y); }
+inline long double builtin_fmin(long double x, long double y) {
+ return __builtin_fminl(x, y);
+}
#endif
-
- inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
- inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
+
+inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); }
+inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_fmod(long double x, long double y) { return __builtin_fmodl(x, y); }
+inline long double builtin_fmod(long double x, long double y) {
+ return __builtin_fmodl(x, y);
+}
#endif
-
- inline float builtin_frexp(float x, int* r) { return __builtin_frexpf(x, r); }
- inline double builtin_frexp(double x, int* r) { return __builtin_frexp(x, r); }
+
+inline float builtin_frexp(float x, int *r) { return __builtin_frexpf(x, r); }
+inline double builtin_frexp(double x, int *r) { return __builtin_frexp(x, r); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_frexp(long double x, int* r) { return __builtin_frexpl(x, r); }
+inline long double builtin_frexp(long double x, int *r) {
+ return __builtin_frexpl(x, r);
+}
#endif
-
- inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
- inline double builtin_hypot(double x, double y) { return __builtin_hypot(x, y); }
+
+inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); }
+inline double builtin_hypot(double x, double y) {
+ return __builtin_hypot(x, y);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_hypot(long double x, long double y) { return __builtin_hypotl(x, y); }
+inline long double builtin_hypot(long double x, long double y) {
+ return __builtin_hypotl(x, y);
+}
#endif
-
- inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
- inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
+
+inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); }
+inline int builtin_ilogb(double x) { return __builtin_ilogb(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
+inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); }
#endif
-
- inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
- inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
+
+inline int builtin_isfinite(float x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(double x) { return __builtin_isfinite(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
+inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); }
#endif
-
- inline int builtin_isinf(float x) { return __builtin_isinf(x); }
- inline int builtin_isinf(double x) { return __builtin_isinf(x); }
+
+inline int builtin_isinf(float x) { return __builtin_isinf(x); }
+inline int builtin_isinf(double x) { return __builtin_isinf(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
+inline int builtin_isinf(long double x) { return __builtin_isinf(x); }
#endif
-
- inline int builtin_isnan(float x) { return __builtin_isnan(x); }
- inline int builtin_isnan(double x) { return __builtin_isnan(x); }
+
+inline int builtin_isnan(float x) { return __builtin_isnan(x); }
+inline int builtin_isnan(double x) { return __builtin_isnan(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
+inline int builtin_isnan(long double x) { return __builtin_isnan(x); }
#endif
-
- inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
- inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
+
+inline int builtin_isnormal(float x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(double x) { return __builtin_isnormal(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
+inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); }
#endif
-
- inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
- inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
+
+inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); }
+inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_ldexp(long double x, int y) { return __builtin_ldexpl(x, y); }
+inline long double builtin_ldexp(long double x, int y) {
+ return __builtin_ldexpl(x, y);
+}
#endif
-
- inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
- inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
+
+inline long long builtin_llrint(float x) { return __builtin_llrintf(x); }
+inline long long builtin_llrint(double x) { return __builtin_llrint(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
+inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); }
#endif
- inline float builtin_log(float x) { return __builtin_logf(x); }
- inline double builtin_log(double x) { return __builtin_log(x); }
+inline float builtin_log(float x) { return __builtin_logf(x); }
+inline double builtin_log(double x) { return __builtin_log(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_log(long double x) { return __builtin_logl(x); }
+inline long double builtin_log(long double x) { return __builtin_logl(x); }
#endif
- inline float builtin_log10(float x) { return __builtin_log10f(x); }
- inline double builtin_log10(double x) { return __builtin_log10(x); }
+inline float builtin_log10(float x) { return __builtin_log10f(x); }
+inline double builtin_log10(double x) { return __builtin_log10(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
+inline long double builtin_log10(long double x) { return __builtin_log10l(x); }
#endif
- inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
- inline double builtin_log1p(double x) { return __builtin_log1p(x); }
+inline float builtin_log1p(float x) { return __builtin_log1pf(x); }
+inline double builtin_log1p(double x) { return __builtin_log1p(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
+inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); }
#endif
- inline float builtin_log2(float x) { return __builtin_log2f(x); }
- inline double builtin_log2(double x) { return __builtin_log2(x); }
+inline float builtin_log2(float x) { return __builtin_log2f(x); }
+inline double builtin_log2(double x) { return __builtin_log2(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
+inline long double builtin_log2(long double x) { return __builtin_log2l(x); }
#endif
-
- inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
- inline long builtin_lrint(double x) { return __builtin_lrint(x); }
+
+inline long builtin_lrint(float x) { return __builtin_lrintf(x); }
+inline long builtin_lrint(double x) { return __builtin_lrint(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
+inline long builtin_lrint(long double x) { return __builtin_lrintl(x); }
#endif
-
- inline float builtin_nextafter(float x, float y) { return __builtin_nextafterf(x, y); }
- inline double builtin_nextafter(double x, double y) { return __builtin_nextafter(x, y); }
+
+inline float builtin_nextafter(float x, float y) {
+ return __builtin_nextafterf(x, y);
+}
+inline double builtin_nextafter(double x, double y) {
+ return __builtin_nextafter(x, y);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_nextafter(long double x, long double y) { return __builtin_nextafterl(x, y); }
+inline long double builtin_nextafter(long double x, long double y) {
+ return __builtin_nextafterl(x, y);
+}
#endif
-
- inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
- inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
+
+inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); }
+inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_pow(long double x, long double y) { return __builtin_powl(x, y); }
+inline long double builtin_pow(long double x, long double y) {
+ return __builtin_powl(x, y);
+}
#endif
-
- inline float builtin_remainder(float x, float y) { return __builtin_remainderf(x, y); }
- inline double builtin_remainder(double x, double y) { return __builtin_remainder(x, y); }
+
+inline float builtin_remainder(float x, float y) {
+ return __builtin_remainderf(x, y);
+}
+inline double builtin_remainder(double x, double y) {
+ return __builtin_remainder(x, y);
+}
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_remainder(long double x, long double y) { return __builtin_remainderl(x, y); }
+inline long double builtin_remainder(long double x, long double y) {
+ return __builtin_remainderl(x, y);
+}
#endif
- inline float builtin_rint(float x) { return __builtin_rintf(x); }
- inline double builtin_rint(double x) { return __builtin_rint(x); }
+inline float builtin_rint(float x) { return __builtin_rintf(x); }
+inline double builtin_rint(double x) { return __builtin_rint(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
+inline long double builtin_rint(long double x) { return __builtin_rintl(x); }
#endif
- inline float builtin_round(float x) { return __builtin_roundf(x); }
- inline double builtin_round(double x) { return __builtin_round(x); }
+inline float builtin_round(float x) { return __builtin_roundf(x); }
+inline double builtin_round(double x) { return __builtin_round(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_round(long double x) { return __builtin_roundl(x); }
+inline long double builtin_round(long double x) { return __builtin_roundl(x); }
#endif
-
- inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
- inline int builtin_signbit(double x) { return __builtin_signbit(x); }
+
+inline int builtin_signbit(float x) { return __builtin_signbitf(x); }
+inline int builtin_signbit(double x) { return __builtin_signbit(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
+inline int builtin_signbit(long double x) { return __builtin_signbitl(x); }
#endif
- inline float builtin_sin(float x) { return __builtin_sinf(x); }
- inline double builtin_sin(double x) { return __builtin_sin(x); }
+inline float builtin_sin(float x) { return __builtin_sinf(x); }
+inline double builtin_sin(double x) { return __builtin_sin(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
+inline long double builtin_sin(long double x) { return __builtin_sinl(x); }
#endif
-
- inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
- inline double builtin_sinh(double x) { return __builtin_sinh(x); }
+
+inline float builtin_sinh(float x) { return __builtin_sinhf(x); }
+inline double builtin_sinh(double x) { return __builtin_sinh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
+inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); }
#endif
-
- inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
- inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
+
+inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); }
+inline double builtin_sqrt(double x) { return __builtin_sqrt(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
+inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); }
#endif
- inline float builtin_tan(float x) { return __builtin_tanf(x); }
- inline double builtin_tan(double x) { return __builtin_tan(x); }
+inline float builtin_tan(float x) { return __builtin_tanf(x); }
+inline double builtin_tan(double x) { return __builtin_tan(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
+inline long double builtin_tan(long double x) { return __builtin_tanl(x); }
#endif
-
- inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
- inline double builtin_tanh(double x) { return __builtin_tanh(x); }
+
+inline float builtin_tanh(float x) { return __builtin_tanhf(x); }
+inline double builtin_tanh(double x) { return __builtin_tanh(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
+inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); }
#endif
-
- inline float builtin_trunc(float x) { return __builtin_truncf(x); }
- inline double builtin_trunc(double x) { return __builtin_trunc(x); }
+
+inline float builtin_trunc(float x) { return __builtin_truncf(x); }
+inline double builtin_trunc(double x) { return __builtin_trunc(x); }
#if __SIZEOF_LONG_DOUBLE__
- inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
+inline long double builtin_trunc(long double x) { return __builtin_truncl(x); }
#endif
-
}
#endif
-#endif // #ifndef FLOATBUILTINS_H
+#endif // #ifndef FLOATBUILTINS_H
diff --git a/floatprops.h b/floatprops.h
index f1c39a2..c7a3b7f 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -10,310 +10,279 @@
#include <cstring>
#include <limits>
+namespace vecmathlib {
+// A structure describing various properties of a floating point
+// type. Most properties are already described in numeric_limits, so
+// we inherit it.
+template <typename real_t> struct floatprops {
+ // Some interesting properties are:
+ // min
+ // max
+ // digits
+ // epsilon
+ // min_exponent
+ // max_exponent
+ // infinity
+ // quiet_NaN
+};
-namespace vecmathlib {
-
- // A structure describing various properties of a floating point
- // type. Most properties are already described in numeric_limits, so
- // we inherit it.
- template<typename real_t>
- struct floatprops {
- // Some interesting properties are:
- // min
- // max
- // digits
- // epsilon
- // min_exponent
- // max_exponent
- // infinity
- // quiet_NaN
- };
-
-
-
- // Properties of fp8
- template<>
- struct floatprops<fp8> {
- typedef fp8 real_t;
- typedef vml_std::int8_t int_t;
- typedef vml_std::uint8_t uint_t;
-
- static char const* name() { return "fp8"; }
-
- // Definitions that might come from numeric_limits<> instead:
- static real_t min() { __builtin_unreachable(); }
- static real_t max() { __builtin_unreachable(); }
- static int const digits = 4;
- static real_t epsilon() { __builtin_unreachable(); }
- static int const min_exponent = -6;
- static int const max_exponent = 7;
- static real_t infinity() { __builtin_unreachable(); }
- static real_t quiet_NaN() { __builtin_unreachable(); }
-
- // Ensure the sizes match
- static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
- static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-
- // Number of bits in internal representation
- static int const bits = 8 * sizeof(real_t);
- static int const mantissa_bits = digits - 1;
- static int const signbit_bits = 1;
- static int const exponent_bits = bits - mantissa_bits - signbit_bits;
- static int const exponent_offset = 2 - min_exponent;
- static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
- "error in bit counts");
- static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
- static uint_t const exponent_mask =
- ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
- static uint_t const signbit_mask = uint_t(1) << (bits-1);
- static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
- "error in masks");
- static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
- uint_t(~uint_t(0)),
- "error in masks");
-
- // Re-interpret bit patterns
- static real_t as_float(int_t x)
- {
- real_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t as_int(real_t x)
- {
- int_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t replicate_byte(unsigned char byte)
- {
- int_t res;
- std::memset(&res, byte, sizeof res);
- return res;
- }
-
- // Convert values (truncate)
- static real_t convert_float(int_t x) { __builtin_unreachable(); }
- static int_t convert_int(real_t x) { __builtin_unreachable(); }
- };
-
-
-
- // Properties of fp16
- template<>
- struct floatprops<fp16> {
- typedef fp16 real_t;
- typedef vml_std::int16_t int_t;
- typedef vml_std::uint16_t uint_t;
-
- static char const* name() { return "fp16"; }
-
- // Definitions that might come from numeric_limits<> instead:
- static real_t min() { __builtin_unreachable(); }
- static real_t max() { __builtin_unreachable(); }
- static int const digits = 11;
- static real_t epsilon() { __builtin_unreachable(); }
- static int const min_exponent = -14;
- static int const max_exponent = 15;
- static real_t infinity() { __builtin_unreachable(); }
- static real_t quiet_NaN() { __builtin_unreachable(); }
-
- // Ensure the sizes match
- static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
- static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-
- // Number of bits in internal representation
- static int const bits = 8 * sizeof(real_t);
- static int const mantissa_bits = digits - 1;
- static int const signbit_bits = 1;
- static int const exponent_bits = bits - mantissa_bits - signbit_bits;
- static int const exponent_offset = 2 - min_exponent;
- static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
- "error in bit counts");
- static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
- static uint_t const exponent_mask =
- ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
- static uint_t const signbit_mask = uint_t(1) << (bits-1);
- static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
- "error in masks");
- static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
- uint_t(~uint_t(0)),
- "error in masks");
-
- // Re-interpret bit patterns
- static real_t as_float(int_t x)
- {
- real_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t as_int(real_t x)
- {
- int_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t replicate_byte(unsigned char byte)
- {
- int_t res;
- std::memset(&res, byte, sizeof res);
- return res;
- }
-
- // Convert values (truncate)
- static real_t convert_float(int_t x) { __builtin_unreachable(); }
- static int_t convert_int(real_t x) { __builtin_unreachable(); }
- };
-
-
-
- // Properties of float
- template<>
- struct floatprops<float>: std::numeric_limits<float> {
- typedef float real_t;
- typedef vml_std::int32_t int_t;
- typedef vml_std::uint32_t uint_t;
-
- static char const* name() { return "float"; }
-
- // Ensure the internal representation is what we expect
- static_assert(is_signed, "real_t is not signed");
- static_assert(radix==2, "real_t is not binary");
-
- // Ensure the sizes match
- static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
- static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-
- // Number of bits in internal representation
- static int const bits = 8 * sizeof(real_t);
- static int const mantissa_bits = digits - 1;
- static int const signbit_bits = 1;
- static int const exponent_bits = bits - mantissa_bits - signbit_bits;
- static int const exponent_offset = 2 - min_exponent;
- static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
- "error in bit counts");
- static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
- static uint_t const exponent_mask =
- ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
- static uint_t const signbit_mask = uint_t(1) << (bits-1);
- static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
- "error in masks");
- static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
- uint_t(~uint_t(0)),
- "error in masks");
-
- // Re-interpret bit patterns
- static real_t as_float(int_t x)
- {
- real_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t as_int(real_t x)
- {
- int_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t replicate_byte(unsigned char byte)
- {
- int_t res;
- std::memset(&res, byte, sizeof res);
- return res;
- }
-
- // Convert values (truncate)
- static real_t convert_float(int_t x) { return real_t(x); }
- static int_t convert_int(real_t x) { return int_t(x); }
- };
-
-
-
- // Properties of double
- template<>
- struct floatprops<double>: std::numeric_limits<double> {
- typedef double real_t;
- typedef vml_std::int64_t int_t;
- typedef vml_std::uint64_t uint_t;
-
- static char const* name() { return "double"; }
-
- // Ensure the internal representation is what we expect
- static_assert(is_signed, "real_t is not signed");
- static_assert(radix==2, "real_t is not binary");
-
- // Ensure the sizes match
- static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
- static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
-
- // Number of bits in internal representation
- static int const bits = 8 * sizeof(real_t);
- static int const mantissa_bits = digits - 1;
- static int const signbit_bits = 1;
- static int const exponent_bits = bits - mantissa_bits - signbit_bits;
- static int const exponent_offset = 2 - min_exponent;
- static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
- "error in bit counts");
- static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
- static uint_t const exponent_mask =
- ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
- static uint_t const signbit_mask = uint_t(1) << (bits-1);
- static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
- "error in masks");
- static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
- uint_t(~uint_t(0)),
- "error in masks");
-
- // Re-interpret bit patterns
- static real_t as_float(int_t x)
- {
- real_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t as_int(real_t x)
- {
- int_t res;
- std::memcpy(&res, &x, sizeof res);
- return res;
- }
- static int_t replicate_byte(unsigned char byte)
- {
- int_t res;
- std::memset(&res, byte, sizeof res);
- return res;
- }
-
- // Convert values (truncate)
- static real_t convert_float(int_t x) { return real_t(x); }
- static int_t convert_int(real_t x) { return int_t(x); }
- };
-
-
-
- // We are adding the (unused) type RV here to avoid name mangling
- // problems. On some systems, the vector size does not enter into
- // the mangled name (!), leading to duplicate function definitions.
- template<typename RV, typename V, typename E>
- E get_elt(const V& v, const int n)
- {
- const size_t s = sizeof(E);
- E e;
- // assert(n>=0 and s*n<sizeof(V));
- std::memcpy(&e, &((const char*)&v)[s*n], s);
- return e;
+// Properties of fp8
+template <> struct floatprops<fp8> {
+ typedef fp8 real_t;
+ typedef vml_std::int8_t int_t;
+ typedef vml_std::uint8_t uint_t;
+
+ static char const *name() { return "fp8"; }
+
+ // Definitions that might come from numeric_limits<> instead:
+ static real_t min() { __builtin_unreachable(); }
+ static real_t max() { __builtin_unreachable(); }
+ static int const digits = 4;
+ static real_t epsilon() { __builtin_unreachable(); }
+ static int const min_exponent = -6;
+ static int const max_exponent = 7;
+ static real_t infinity() { __builtin_unreachable(); }
+ static real_t quiet_NaN() { __builtin_unreachable(); }
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+ << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x) {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x) {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t replicate_byte(unsigned char byte) {
+ int_t res;
+ std::memset(&res, byte, sizeof res);
+ return res;
+ }
+
+ // Convert values (truncate)
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of fp16
+template <> struct floatprops<fp16> {
+ typedef fp16 real_t;
+ typedef vml_std::int16_t int_t;
+ typedef vml_std::uint16_t uint_t;
+
+ static char const *name() { return "fp16"; }
+
+ // Definitions that might come from numeric_limits<> instead:
+ static real_t min() { __builtin_unreachable(); }
+ static real_t max() { __builtin_unreachable(); }
+ static int const digits = 11;
+ static real_t epsilon() { __builtin_unreachable(); }
+ static int const min_exponent = -14;
+ static int const max_exponent = 15;
+ static real_t infinity() { __builtin_unreachable(); }
+ static real_t quiet_NaN() { __builtin_unreachable(); }
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+ << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x) {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x) {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t replicate_byte(unsigned char byte) {
+ int_t res;
+ std::memset(&res, byte, sizeof res);
+ return res;
+ }
+
+ // Convert values (truncate)
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
+};
+
+// Properties of float
+template <> struct floatprops<float> : std::numeric_limits<float> {
+ typedef float real_t;
+ typedef vml_std::int32_t int_t;
+ typedef vml_std::uint32_t uint_t;
+
+ static char const *name() { return "float"; }
+
+ // Ensure the internal representation is what we expect
+ static_assert(is_signed, "real_t is not signed");
+ static_assert(radix == 2, "real_t is not binary");
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+ << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x) {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
}
-
- template<typename RV, typename V, typename E>
- V& set_elt(V& v, const int n, const E e)
- {
- const size_t s = sizeof(E);
- // assert(n>=0 and s*n<sizeof(V));
- std::memcpy(&((char*)&v)[s*n], &e, s);
- return v;
+ static int_t as_int(real_t x) {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
}
-
+ static int_t replicate_byte(unsigned char byte) {
+ int_t res;
+ std::memset(&res, byte, sizeof res);
+ return res;
+ }
+
+ // Convert values (truncate)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// Properties of double
+template <> struct floatprops<double> : std::numeric_limits<double> {
+ typedef double real_t;
+ typedef vml_std::int64_t int_t;
+ typedef vml_std::uint64_t uint_t;
+
+ static char const *name() { return "double"; }
+
+ // Ensure the internal representation is what we expect
+ static_assert(is_signed, "real_t is not signed");
+ static_assert(radix == 2, "real_t is not binary");
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1)
+ << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits - 1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x) {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x) {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t replicate_byte(unsigned char byte) {
+ int_t res;
+ std::memset(&res, byte, sizeof res);
+ return res;
+ }
+
+ // Convert values (truncate)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x) { return int_t(x); }
+};
+
+// We are adding the (unused) type RV here to avoid name mangling
+// problems. On some systems, the vector size does not enter into
+// the mangled name (!), leading to duplicate function definitions.
+template <typename RV, typename V, typename E>
+E get_elt(const V &v, const int n) {
+ const size_t s = sizeof(E);
+ E e;
+ // assert(n>=0 and s*n<sizeof(V));
+ std::memcpy(&e, &((const char *)&v)[s * n], s);
+ return e;
+}
+
+template <typename RV, typename V, typename E>
+V &set_elt(V &v, const int n, const E e) {
+ const size_t s = sizeof(E);
+ // assert(n>=0 and s*n<sizeof(V));
+ std::memcpy(&((char *)&v)[s * n], &e, s);
+ return v;
+}
+
} // namespace vecmathlib
-#endif // #ifndef FLOATPROPS_H
+#endif // #ifndef FLOATPROPS_H
diff --git a/floattypes.h b/floattypes.h
index 5107af6..e037b95 100644
--- a/floattypes.h
+++ b/floattypes.h
@@ -3,20 +3,14 @@
#ifndef FLOATTYPES_H
#define FLOATTYPES_H
-
-
#include <cassert>
#include <cstdlib>
-
-
-#if ! (defined __clang__ || defined __gcc__)
-# define __builtin_unreachable() (assert(0))
-# define __builtin_expect(expr, val) (expr)
+#if !(defined __clang__ || defined __gcc__)
+#define __builtin_unreachable() (assert(0))
+#define __builtin_expect(expr, val) (expr)
#endif
-
-
// We expect either 199711L or 201103L
#if __cplusplus >= 201103L
// C++11 is supported, use it
@@ -25,11 +19,9 @@
#include <cstdint>
namespace vml_std {
- using namespace std;
+using namespace std;
}
-
-
#else
// C++11 is not supported, work around the missing pieces
@@ -40,38 +32,35 @@ namespace vml_std {
#include <stdint.h>
#ifndef static_assert
-# define VML_CONCAT2(x, y) x##y
-# define VML_CONCAT(x, y) VML_CONCAT2(x, y)
-# define static_assert(cond, msg) \
- typedef int VML_CONCAT(vml_static_assert_, __LINE__)[(cond) ? 1 : -1] \
- __attribute__((__unused__))
+#define VML_CONCAT2(x, y) x##y
+#define VML_CONCAT(x, y) VML_CONCAT2(x, y)
+#define static_assert(cond, msg) typedef int VML_CONCAT( \
+ vml_static_assert_, __LINE__)[(cond) ? 1 : -1] __attribute__((__unused__))
#endif
-
-
// Capture libc macros, then undefine them
#ifndef isfinite
-# error "isfinite is not a macro"
+#error "isfinite is not a macro"
#endif
#ifndef isinf
-# error "isinf is not a macro"
+#error "isinf is not a macro"
#endif
#ifndef isnan
-# error "isnan is not a macro"
+#error "isnan is not a macro"
#endif
#ifndef isnormal
-# error "isnormal is not a macro"
+#error "isnormal is not a macro"
#endif
#ifndef signbit
-# error "signbit is not a macro"
+#error "signbit is not a macro"
#endif
namespace {
- template<typename T> inline int libc_isfinite(T x) { return isfinite(x); }
- template<typename T> inline int libc_isinf(T x) { return isinf(x); }
- template<typename T> inline int libc_isnan(T x) { return isnan(x); }
- template<typename T> inline int libc_isnormal(T x) { return isnormal(x); }
- template<typename T> inline int libc_signbit(T x) { return signbit(x); }
+template <typename T> inline int libc_isfinite(T x) { return isfinite(x); }
+template <typename T> inline int libc_isinf(T x) { return isinf(x); }
+template <typename T> inline int libc_isnan(T x) { return isnan(x); }
+template <typename T> inline int libc_isnormal(T x) { return isnormal(x); }
+template <typename T> inline int libc_signbit(T x) { return signbit(x); }
}
// Include this before undefining the macros below
@@ -83,153 +72,146 @@ namespace {
#undef isnormal
#undef signbit
-
-
namespace vml_std {
-
- // Make some type definitions from stdint.h available in std
- typedef ::uint8_t uint8_t;
- typedef ::int8_t int8_t;
- typedef ::uint16_t uint16_t;
- typedef ::int16_t int16_t;
- typedef ::uint32_t uint32_t;
- typedef ::int32_t int32_t;
+
+// Make some type definitions from stdint.h available in std
+typedef ::uint8_t uint8_t;
+typedef ::int8_t int8_t;
+typedef ::uint16_t uint16_t;
+typedef ::int16_t int16_t;
+typedef ::uint32_t uint32_t;
+typedef ::int32_t int32_t;
#if __SIZEOF_LONG__ == 8
- // Even if both "long" and "long long" have the same size, they are
- // still different types. In many cases, it is then preferable to
- // use "long" instead of "long long".
- typedef unsigned long uint64_t;
- typedef long int64_t;
+// Even if both "long" and "long long" have the same size, they are
+// still different types. In many cases, it is then preferable to
+// use "long" instead of "long long".
+typedef unsigned long uint64_t;
+typedef long int64_t;
#else
- typedef ::uint64_t uint64_t;
- typedef ::int64_t int64_t;
+typedef ::uint64_t uint64_t;
+typedef ::int64_t int64_t;
#endif
-
-
-
- // Make math functions from math.h available in vml_std
- // (We could instead take some of them -- but not all -- from std.)
-
- inline float acos(float x) { return ::acosf(x); }
- inline float acosh(float x) { return ::acoshf(x); }
- inline float asin(float x) { return ::asinf(x); }
- inline float asinh(float x) { return ::asinhf(x); }
- inline float atan(float x) { return ::atanf(x); }
- inline float atan2(float x, float y) { return ::atan2f(x, y); }
- inline float atanh(float x) { return ::atanhf(x); }
- inline float cbrt(float x) { return ::cbrtf(x); }
- inline float ceil(float x) { return ::ceilf(x); }
- inline float cos(float x) { return ::cosf(x); }
- inline float cosh(float x) { return ::coshf(x); }
- inline float copysign(float x, float y) { return ::copysignf(x, y); }
- inline float exp(float x) { return ::expf(x); }
- inline float exp2(float x) { return ::exp2f(x); }
- inline float expm1(float x) { return ::expm1f(x); }
- inline float fabs(float x) { return ::fabsf(x); }
- inline float fdim(float x, float y) { return ::fdimf(x, y); }
- inline float floor(float x) { return ::floorf(x); }
- inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
- inline float fmax(float x, float y) { return ::fmaxf(x, y); }
- inline float fmin(float x, float y) { return ::fminf(x, y); }
- inline float fmod(float x, float y) { return ::fmodf(x, y); }
- inline float frexp(float x, int* r) { return ::frexpf(x, r); }
- inline float hypot(float x, float y) { return ::hypotf(x, y); }
- inline int ilogb(float x) { return ::ilogbf(x); }
- inline bool isfinite(float x) { return libc_isfinite(x); }
- inline bool isinf(float x) { return libc_isinf(x); }
- inline bool isnan(float x) { return libc_isnan(x); }
- inline bool isnormal(float x) { return libc_isnormal(x); }
- inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
- inline long long llrint(float x) { return ::llrintf(x); }
- inline float log(float x) { return ::logf(x); }
- inline float log10(float x) { return ::log10f(x); }
- inline float log1p(float x) { return ::log1pf(x); }
- inline float log2(float x) { return ::log2f(x); }
- inline long lrint(float x) { return ::lrintf(x); }
- inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
- inline float pow(float x, float y) { return ::powf(x, y); }
- inline float remainder(float x, float y) { return ::remainderf(x, y); }
- inline float rint(float x) { return ::rintf(x); }
- inline float round(float x) { return ::roundf(x); }
- inline bool signbit(float x) { return libc_signbit(x); }
- inline float sin(float x) { return ::sinf(x); }
- inline float sinh(float x) { return ::sinhf(x); }
- inline float sqrt(float x) { return ::sqrtf(x); }
- inline float tan(float x) { return ::tanf(x); }
- inline float tanh(float x) { return ::tanhf(x); }
- inline float trunc(float x) { return ::truncf(x); }
-
- inline double acos(double x) { return ::acos(x); }
- inline double acosh(double x) { return ::acosh(x); }
- inline double asin(double x) { return ::asin(x); }
- inline double asinh(double x) { return ::asinh(x); }
- inline double atan(double x) { return ::atan(x); }
- inline double atan2(double x, double y) { return ::atan2(x, y); }
- inline double atanh(double x) { return ::atanh(x); }
- inline double cbrt(double x) { return ::cbrt(x); }
- inline double ceil(double x) { return ::ceil(x); }
- inline double cos(double x) { return ::cos(x); }
- inline double cosh(double x) { return ::cosh(x); }
- inline double copysign(double x, double y) { return ::copysign(x, y); }
- inline double exp(double x) { return ::exp(x); }
- inline double exp2(double x) { return ::exp2(x); }
- inline double expm1(double x) { return ::expm1(x); }
- inline double fabs(double x) { return ::fabs(x); }
- inline double fdim(double x, double y) { return ::fdim(x, y); }
- inline double floor(double x) { return ::floor(x); }
- inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
- inline double fmax(double x, double y) { return ::fmax(x, y); }
- inline double fmin(double x, double y) { return ::fmin(x, y); }
- inline double fmod(double x, double y) { return ::fmod(x, y); }
- inline double frexp(double x, int* r) { return ::frexp(x, r); }
- inline double hypot(double x, double y) { return ::hypot(x, y); }
- inline int ilogb(double x) { return ::ilogb(x); }
- inline bool isfinite(double x) { return libc_isfinite(x); }
- inline bool isinf(double x) { return libc_isinf(x); }
- inline bool isnan(double x) { return libc_isnan(x); }
- inline bool isnormal(double x) { return libc_isnormal(x); }
- inline double ldexp(double x, int n) { return ::ldexp(x, n); }
- inline long long llrint(double x) { return ::llrint(x); }
- inline double log(double x) { return ::log(x); }
- inline double log10(double x) { return ::log10(x); }
- inline double log1p(double x) { return ::log1p(x); }
- inline double log2(double x) { return ::log2(x); }
- inline long lrint(double x) { return ::lrint(x); }
- inline double nextafter(double x, double y) { return ::nextafter(x, y); }
- inline double pow(double x, double y) { return ::pow(x, y); }
- inline double remainder(double x, double y) { return ::remainder(x, y); }
- inline double rint(double x) { return ::rint(x); }
- inline double round(double x) { return ::round(x); }
- inline bool signbit(double x) { return libc_signbit(x); }
- inline double sin(double x) { return ::sin(x); }
- inline double sinh(double x) { return ::sinh(x); }
- inline double sqrt(double x) { return ::sqrt(x); }
- inline double tan(double x) { return ::tan(x); }
- inline double tanh(double x) { return ::tanh(x); }
- inline double trunc(double x) { return ::trunc(x); }
-
+
+// Make math functions from math.h available in vml_std
+// (We could instead take some of them -- but not all -- from std.)
+
+inline float acos(float x) { return ::acosf(x); }
+inline float acosh(float x) { return ::acoshf(x); }
+inline float asin(float x) { return ::asinf(x); }
+inline float asinh(float x) { return ::asinhf(x); }
+inline float atan(float x) { return ::atanf(x); }
+inline float atan2(float x, float y) { return ::atan2f(x, y); }
+inline float atanh(float x) { return ::atanhf(x); }
+inline float cbrt(float x) { return ::cbrtf(x); }
+inline float ceil(float x) { return ::ceilf(x); }
+inline float cos(float x) { return ::cosf(x); }
+inline float cosh(float x) { return ::coshf(x); }
+inline float copysign(float x, float y) { return ::copysignf(x, y); }
+inline float exp(float x) { return ::expf(x); }
+inline float exp2(float x) { return ::exp2f(x); }
+inline float expm1(float x) { return ::expm1f(x); }
+inline float fabs(float x) { return ::fabsf(x); }
+inline float fdim(float x, float y) { return ::fdimf(x, y); }
+inline float floor(float x) { return ::floorf(x); }
+inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); }
+inline float fmax(float x, float y) { return ::fmaxf(x, y); }
+inline float fmin(float x, float y) { return ::fminf(x, y); }
+inline float fmod(float x, float y) { return ::fmodf(x, y); }
+inline float frexp(float x, int *r) { return ::frexpf(x, r); }
+inline float hypot(float x, float y) { return ::hypotf(x, y); }
+inline int ilogb(float x) { return ::ilogbf(x); }
+inline bool isfinite(float x) { return libc_isfinite(x); }
+inline bool isinf(float x) { return libc_isinf(x); }
+inline bool isnan(float x) { return libc_isnan(x); }
+inline bool isnormal(float x) { return libc_isnormal(x); }
+inline float ldexp(float x, int n) { return ::ldexpf(x, n); }
+inline long long llrint(float x) { return ::llrintf(x); }
+inline float log(float x) { return ::logf(x); }
+inline float log10(float x) { return ::log10f(x); }
+inline float log1p(float x) { return ::log1pf(x); }
+inline float log2(float x) { return ::log2f(x); }
+inline long lrint(float x) { return ::lrintf(x); }
+inline float nextafter(float x, float y) { return ::nextafterf(x, y); }
+inline float pow(float x, float y) { return ::powf(x, y); }
+inline float remainder(float x, float y) { return ::remainderf(x, y); }
+inline float rint(float x) { return ::rintf(x); }
+inline float round(float x) { return ::roundf(x); }
+inline bool signbit(float x) { return libc_signbit(x); }
+inline float sin(float x) { return ::sinf(x); }
+inline float sinh(float x) { return ::sinhf(x); }
+inline float sqrt(float x) { return ::sqrtf(x); }
+inline float tan(float x) { return ::tanf(x); }
+inline float tanh(float x) { return ::tanhf(x); }
+inline float trunc(float x) { return ::truncf(x); }
+
+inline double acos(double x) { return ::acos(x); }
+inline double acosh(double x) { return ::acosh(x); }
+inline double asin(double x) { return ::asin(x); }
+inline double asinh(double x) { return ::asinh(x); }
+inline double atan(double x) { return ::atan(x); }
+inline double atan2(double x, double y) { return ::atan2(x, y); }
+inline double atanh(double x) { return ::atanh(x); }
+inline double cbrt(double x) { return ::cbrt(x); }
+inline double ceil(double x) { return ::ceil(x); }
+inline double cos(double x) { return ::cos(x); }
+inline double cosh(double x) { return ::cosh(x); }
+inline double copysign(double x, double y) { return ::copysign(x, y); }
+inline double exp(double x) { return ::exp(x); }
+inline double exp2(double x) { return ::exp2(x); }
+inline double expm1(double x) { return ::expm1(x); }
+inline double fabs(double x) { return ::fabs(x); }
+inline double fdim(double x, double y) { return ::fdim(x, y); }
+inline double floor(double x) { return ::floor(x); }
+inline double fma(double x, double y, double z) { return ::fma(x, y, z); }
+inline double fmax(double x, double y) { return ::fmax(x, y); }
+inline double fmin(double x, double y) { return ::fmin(x, y); }
+inline double fmod(double x, double y) { return ::fmod(x, y); }
+inline double frexp(double x, int *r) { return ::frexp(x, r); }
+inline double hypot(double x, double y) { return ::hypot(x, y); }
+inline int ilogb(double x) { return ::ilogb(x); }
+inline bool isfinite(double x) { return libc_isfinite(x); }
+inline bool isinf(double x) { return libc_isinf(x); }
+inline bool isnan(double x) { return libc_isnan(x); }
+inline bool isnormal(double x) { return libc_isnormal(x); }
+inline double ldexp(double x, int n) { return ::ldexp(x, n); }
+inline long long llrint(double x) { return ::llrint(x); }
+inline double log(double x) { return ::log(x); }
+inline double log10(double x) { return ::log10(x); }
+inline double log1p(double x) { return ::log1p(x); }
+inline double log2(double x) { return ::log2(x); }
+inline long lrint(double x) { return ::lrint(x); }
+inline double nextafter(double x, double y) { return ::nextafter(x, y); }
+inline double pow(double x, double y) { return ::pow(x, y); }
+inline double remainder(double x, double y) { return ::remainder(x, y); }
+inline double rint(double x) { return ::rint(x); }
+inline double round(double x) { return ::round(x); }
+inline bool signbit(double x) { return libc_signbit(x); }
+inline double sin(double x) { return ::sin(x); }
+inline double sinh(double x) { return ::sinh(x); }
+inline double sqrt(double x) { return ::sqrt(x); }
+inline double tan(double x) { return ::tan(x); }
+inline double tanh(double x) { return ::tanh(x); }
+inline double trunc(double x) { return ::trunc(x); }
}
#endif
+namespace vecmathlib {
+
+struct fp8 {
+ // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?)
+ vml_std::uint8_t val;
+ fp8() {}
+ fp8(double x) { __builtin_unreachable(); }
+};
+struct fp16 {
+ // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?)
+ vml_std::uint16_t val;
+ fp16() {}
+ fp16(double x) { __builtin_unreachable(); }
+};
-namespace vecmathlib {
-
- struct fp8 {
- // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?)
- vml_std::uint8_t val;
- fp8() {}
- fp8(double x) { __builtin_unreachable(); }
- };
-
- struct fp16 {
- // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?)
- vml_std::uint16_t val;
- fp16() {}
- fp16(double x) { __builtin_unreachable(); }
- };
-
} // namespace vecmathlib
-#endif // #ifndef FLOATTYPES_H
+#endif // #ifndef FLOATTYPES_H
diff --git a/instantiations.cc b/instantiations.cc
index 9bd5351..956e1b9 100644
--- a/instantiations.cc
+++ b/instantiations.cc
@@ -7,84 +7,105 @@
#include "vecmathlib.h"
+namespace vecmathlib {
+template <typename realvec_t, int n>
+typename realvec_t::real_t get_elt(realvec_t x) {
+ return x[n];
+}
+template <typename realvec_t, int n>
+realvec_t set_elt(realvec_t x, typename realvec_t::real_t a) {
+ return x.set_elt(n, a);
+}
+
+// template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x);
+// template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x,
+// realbuiltinvec<float,1> y);
+// template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x,
+// intbuiltinvec<float,1>::int_t n);
+// template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x,
+// intbuiltinvec<double,1>::int_t n);
+// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x,
+// intbuiltinvec<double,2>::int_t n);
+// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x,
+// intbuiltinvec<double,2> n);
+// template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c,
+// realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
+// template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t
+// c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y);
+// template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c,
+// realbuiltinvec<float,4> x, realbuiltinvec<float,4> y);
+// template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t
+// c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y);
-namespace vecmathlib {
-
- template<typename realvec_t, int n>
- typename realvec_t::real_t get_elt(realvec_t x)
- {
- return x[n];
- }
- template<typename realvec_t, int n>
- realvec_t set_elt(realvec_t x, typename realvec_t::real_t a)
- {
- return x.set_elt(n, a);
- }
-
- // template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x);
- // template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
- // template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x, intbuiltinvec<float,1>::int_t n);
- // template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x, intbuiltinvec<double,1>::int_t n);
- // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2>::int_t n);
- // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2> n);
- // template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c, realbuiltinvec<float,1> x, realbuiltinvec<float,1> y);
- // template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y);
- // template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c, realbuiltinvec<float,4> x, realbuiltinvec<float,4> y);
- // template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y);
-
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
- template realvec<float,1> round(realvec<float,1> x);
+template realvec<float, 1> round(realvec<float, 1> x);
#endif
-
+
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
- template intvec<float,8> popcount(intvec<float,8>);
+template intvec<float, 8> popcount(intvec<float, 8>);
#endif
-
+
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
- template realvec<double,1> exp(realvec<double,1> x);
- template realvec<double,1> log(realvec<double,1> x);
- template realvec<double,1> sin(realvec<double,1> x);
- template realvec<double,1> sqrt(realvec<double,1> x);
- template realvec<double,1>::real_t get_elt<realvec<double,1>,0>(realvec<double,1> x);
- template realvec<double,1> set_elt<realvec<double,1>,0>(realvec<double,1> x, realvec<double,1>::real_t a);
+template realvec<double, 1> exp(realvec<double, 1> x);
+template realvec<double, 1> log(realvec<double, 1> x);
+template realvec<double, 1> sin(realvec<double, 1> x);
+template realvec<double, 1> sqrt(realvec<double, 1> x);
+template realvec<double, 1>::real_t
+get_elt<realvec<double, 1>, 0>(realvec<double, 1> x);
+template realvec<double, 1>
+set_elt<realvec<double, 1>, 0>(realvec<double, 1> x,
+ realvec<double, 1>::real_t a);
#endif
-
+
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
- template realvec<double,2> exp(realvec<double,2> x);
- template realvec<double,2> log(realvec<double,2> x);
- template realvec<double,2> sin(realvec<double,2> x);
- template realvec<double,2> sqrt(realvec<double,2> x);
- template realvec<double,2>::real_t get_elt<realvec<double,2>,0>(realvec<double,2>);
- template realvec<double,2>::real_t get_elt<realvec<double,2>,1>(realvec<double,2>);
- template realvec<double,2> set_elt<realvec<double,2>,0>(realvec<double,2> x, realvec<double,2>::real_t a);
- template realvec<double,2> set_elt<realvec<double,2>,1>(realvec<double,2> x, realvec<double,2>::real_t a);
+template realvec<double, 2> exp(realvec<double, 2> x);
+template realvec<double, 2> log(realvec<double, 2> x);
+template realvec<double, 2> sin(realvec<double, 2> x);
+template realvec<double, 2> sqrt(realvec<double, 2> x);
+template realvec<double, 2>::real_t
+get_elt<realvec<double, 2>, 0>(realvec<double, 2>);
+template realvec<double, 2>::real_t
+get_elt<realvec<double, 2>, 1>(realvec<double, 2>);
+template realvec<double, 2>
+set_elt<realvec<double, 2>, 0>(realvec<double, 2> x,
+ realvec<double, 2>::real_t a);
+template realvec<double, 2>
+set_elt<realvec<double, 2>, 1>(realvec<double, 2> x,
+ realvec<double, 2>::real_t a);
#endif
-
+
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
- template realvec<double,4> exp(realvec<double,4> x);
- template realvec<double,4> log(realvec<double,4> x);
- template realvec<double,4> sin(realvec<double,4> x);
- template realvec<double,4> sqrt(realvec<double,4> x);
- template realvec<double,4>::real_t get_elt<realvec<double,4>,0>(realvec<double,4>);
- template realvec<double,4>::real_t get_elt<realvec<double,4>,1>(realvec<double,4>);
- template realvec<double,4>::real_t get_elt<realvec<double,4>,2>(realvec<double,4>);
- template realvec<double,4>::real_t get_elt<realvec<double,4>,3>(realvec<double,4>);
- template realvec<double,4> set_elt<realvec<double,4>,0>(realvec<double,4> x, realvec<double,4>::real_t a);
- template realvec<double,4> set_elt<realvec<double,4>,1>(realvec<double,4> x, realvec<double,4>::real_t a);
- template realvec<double,4> set_elt<realvec<double,4>,2>(realvec<double,4> x, realvec<double,4>::real_t a);
- template realvec<double,4> set_elt<realvec<double,4>,3>(realvec<double,4> x, realvec<double,4>::real_t a);
- template intvec<double,4> popcount(intvec<double,4>);
+template realvec<double, 4> exp(realvec<double, 4> x);
+template realvec<double, 4> log(realvec<double, 4> x);
+template realvec<double, 4> sin(realvec<double, 4> x);
+template realvec<double, 4> sqrt(realvec<double, 4> x);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 0>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 1>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 2>(realvec<double, 4>);
+template realvec<double, 4>::real_t
+get_elt<realvec<double, 4>, 3>(realvec<double, 4>);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 0>(realvec<double, 4> x,
+ realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 1>(realvec<double, 4> x,
+ realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 2>(realvec<double, 4> x,
+ realvec<double, 4>::real_t a);
+template realvec<double, 4>
+set_elt<realvec<double, 4>, 3>(realvec<double, 4> x,
+ realvec<double, 4>::real_t a);
+template intvec<double, 4> popcount(intvec<double, 4>);
#endif
-
}
-
-
// Various tests to detect auto-vectorization features
-
-
#include <cassert>
#include <cstdlib>
using namespace std;
@@ -92,32 +113,25 @@ using namespace std;
using namespace vecmathlib;
#if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-typedef realvec<double,4> realV;
+typedef realvec<double, 4> realV;
#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-typedef realvec<double,2> realV;
+typedef realvec<double, 2> realV;
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_8
-typedef realvec<float,8> realV;
+typedef realvec<float, 8> realV;
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_4
-typedef realvec<float,4> realV;
+typedef realvec<float, 4> realV;
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_2
-typedef realvec<float,2> realV;
+typedef realvec<float, 2> realV;
#else
-# error "There are no vector types"
+#error "There are no vector types"
#endif
typedef realV::scalar_t real;
const int vecsize = realV::size;
-
-
// Simple, naive loop adding two arrays
-extern "C"
-void loop_add(real* a,
- real* b,
- real* c,
- ptrdiff_t n)
-{
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add(real *a, real *b, real *c, ptrdiff_t n) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpb = realV::loadu(&b[i]);
realV tmpc = realV::loadu(&c[i]);
realV tmpa = tmpb + tmpc;
@@ -125,16 +139,10 @@ void loop_add(real* a,
}
}
-
-
// Declare pointers as restrict
-extern "C"
-void loop_add_restrict(real *restrict a,
- real *restrict b,
- real *restrict c,
- ptrdiff_t n)
-{
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add_restrict(real *restrict a, real *restrict b,
+ real *restrict c, ptrdiff_t n) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpb = realV::loadu(&b[i]);
realV tmpc = realV::loadu(&c[i]);
realV tmpa = tmpb + tmpc;
@@ -142,16 +150,10 @@ void loop_add_restrict(real *restrict a,
}
}
-
-
// Declare pointers as restrict and aligned
-extern "C"
-void loop_add_aligned(real *restrict a,
- real *restrict b,
- real *restrict c,
- ptrdiff_t n)
-{
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_add_aligned(real *restrict a, real *restrict b,
+ real *restrict c, ptrdiff_t n) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpb = realV::loada(&b[i]);
realV tmpc = realV::loada(&c[i]);
realV tmpa = tmpb + tmpc;
@@ -159,16 +161,11 @@ void loop_add_aligned(real *restrict a,
}
}
-
-
// Reduction loop
-extern "C"
-real loop_dot_reduce(real *restrict a,
- real *restrict b,
- ptrdiff_t n)
-{
+extern "C" real loop_dot_reduce(real *restrict a, real *restrict b,
+ ptrdiff_t n) {
realV sumV = 0.0;
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpa = realV::loada(&a[i]);
realV tmpb = realV::loada(&b[i]);
sumV += tmpa * tmpb;
@@ -176,16 +173,10 @@ real loop_dot_reduce(real *restrict a,
return sum(sumV);
}
-
-
// Loop with a simple if condition (fmax)
-extern "C"
-void loop_if_simple(real *restrict a,
- real *restrict b,
- real *restrict c,
- ptrdiff_t n)
-{
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_if_simple(real *restrict a, real *restrict b,
+ real *restrict c, ptrdiff_t n) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpb = realV::loada(&b[i]);
realV tmpc = realV::loada(&c[i]);
realV tmpa = ifthen(tmpb > tmpc, tmpb, tmpc);
@@ -193,16 +184,10 @@ void loop_if_simple(real *restrict a,
}
}
-
-
// Loop with a complex if condition (select)
-extern "C"
-void loop_if(real *restrict a,
- real *restrict b,
- real *restrict c,
- ptrdiff_t n)
-{
- for (ptrdiff_t i=0; i<n; i+=vecsize) {
+extern "C" void loop_if(real *restrict a, real *restrict b, real *restrict c,
+ ptrdiff_t n) {
+ for (ptrdiff_t i = 0; i < n; i += vecsize) {
realV tmpb = realV::loada(&b[i]);
realV tmpc = realV::loada(&c[i]);
realV tmpa = ifthen(tmpb > realV(0.0), tmpb * tmpc, realV(1.0));
@@ -210,16 +195,10 @@ void loop_if(real *restrict a,
}
}
-
-
// Skip ghost points
-extern "C"
-void loop_add_masked(real *restrict a,
- real *restrict b,
- real *restrict c,
- ptrdiff_t n)
-{
- for (realV::mask_t mask(1, n-1, 0); mask; ++mask) {
+extern "C" void loop_add_masked(real *restrict a, real *restrict b,
+ real *restrict c, ptrdiff_t n) {
+ for (realV::mask_t mask(1, n - 1, 0); mask; ++mask) {
ptrdiff_t i = mask.index();
realV tmpb = realV::loada(&b[i]);
realV tmpc = realV::loada(&c[i]);
diff --git a/interp.cc b/interp.cc
index 12bac0e..95e2cfa 100644
--- a/interp.cc
+++ b/interp.cc
@@ -13,12 +13,8 @@ typedef realvec_t::real_t real_t;
typedef realvec_t::intvec_t intvec_t;
typedef intvec_t::int_t int_t;
-
-
-realvec_t interp(const real_t* array, ptrdiff_t size,
- real_t xmin, real_t xmax,
- realvec_t x)
-{
+realvec_t interp(const real_t *array, ptrdiff_t size, real_t xmin, real_t xmax,
+ realvec_t x) {
assert(size >= 2);
// spacing
real_t dx = (xmax - xmin) / (size - 1);
@@ -29,11 +25,11 @@ realvec_t interp(const real_t* array, ptrdiff_t size,
intvec_t n = convert_int(cell);
// gather values from array
realvec_t x0, x1;
- for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
// ensure location is not out of bounds
- ptrdiff_t j = max(ptrdiff_t(0), min(size-2, ptrdiff_t(n[i])));
+ ptrdiff_t j = max(ptrdiff_t(0), min(size - 2, ptrdiff_t(n[i])));
x0.set_elt(i, array[j]);
- x1.set_elt(i, array[j+1]);
+ x1.set_elt(i, array[j + 1]);
}
// determine interpolation weights
realvec_t offset = scaled - cell;
@@ -44,20 +40,18 @@ realvec_t interp(const real_t* array, ptrdiff_t size,
return y;
}
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
ptrdiff_t size = 1001;
vector<real_t> array(size);
- for (ptrdiff_t i=0; i<size; ++i) array[i] = real_t(i) / 1000.0;
-
+ for (ptrdiff_t i = 0; i < size; ++i)
+ array[i] = real_t(i) / 1000.0;
+
real_t xmin = 0.0;
real_t xmax = 0.5;
realvec_t x = 0.333;
cout << "x=" << x << "\n";
realvec_t y = interp(&array[0], size, xmin, xmax, x);
cout << "y=" << y << "\n";
-
+
return 0;
}
diff --git a/loop.cc b/loop.cc
index ca8ebb8..8b42970 100644
--- a/loop.cc
+++ b/loop.cc
@@ -14,68 +14,57 @@
using namespace std;
using namespace vecmathlib;
-
-
////////////////////////////////////////////////////////////////////////////////
// Helpers
////////////////////////////////////////////////////////////////////////////////
#ifndef __has_builtin
-# define __has_builtin(x) 0 // Compatibility with non-clang compilers
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers
#endif
// align upwards
-static size_t align_up(size_t i, size_t size)
-{
+static size_t align_up(size_t i, size_t size) {
return (i + size - 1) / size * size;
}
-
-
////////////////////////////////////////////////////////////////////////////////
// High-resolution timer
////////////////////////////////////////////////////////////////////////////////
typedef unsigned long long ticks;
-inline ticks getticks()
-{
+inline ticks getticks() {
#if __has_builtin(__builtin_readcyclecounter)
return __builtin_readcyclecounter();
#elif defined __x86_64__
ticks a, d;
- asm volatile("rdtsc" : "=a" (a), "=d" (d));
+ asm volatile("rdtsc" : "=a"(a), "=d"(d));
return a | (d << 32);
#elif defined __powerpc__
unsigned int tbl, tbu, tbu1;
do {
- asm volatile("mftbu %0": "=r"(tbu));
- asm volatile("mftb %0": "=r"(tbl));
- asm volatile("mftbu %0": "=r"(tbu1));
+ asm volatile("mftbu %0" : "=r"(tbu));
+ asm volatile("mftb %0" : "=r"(tbl));
+ asm volatile("mftbu %0" : "=r"(tbu1));
} while (tbu != tbu1);
return ((unsigned long long)tbu << 32) | tbl;
#else
timeval tv;
gettimeofday(&tv, NULL);
return 1000000ULL * tv.tv_sec + tv.tv_usec;
- // timespec ts;
- // clock_gettime(CLOCK_REALTIME, &ts);
- // return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
+// timespec ts;
+// clock_gettime(CLOCK_REALTIME, &ts);
+// return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
#endif
}
-inline double elapsed(ticks t1, ticks t0)
-{
- return t1-t0;
-}
+inline double elapsed(ticks t1, ticks t0) { return t1 - t0; }
-double get_sys_time()
-{
+double get_sys_time() {
timeval tp;
gettimeofday(&tp, NULL);
return tp.tv_sec + 1.0e-6 * tp.tv_usec;
}
-double measure_tick()
-{
+double measure_tick() {
ticks const rstart = getticks();
double const wstart = get_sys_time();
while (get_sys_time() - wstart < 0.1) {
@@ -83,236 +72,219 @@ double measure_tick()
}
ticks const rend = getticks();
double const wend = get_sys_time();
- assert(wend-wstart >= 0.09);
+ assert(wend - wstart >= 0.09);
return (wend - wstart) / elapsed(rend, rstart);
}
-
-
////////////////////////////////////////////////////////////////////////////////
// Initialize the grid
////////////////////////////////////////////////////////////////////////////////
-template<typename realvec_t>
-void init(typename realvec_t::real_t *restrict xptr,
- ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
- for (ptrdiff_t j=0; j<n; ++j) {
- for (ptrdiff_t i=0; i<m; ++i) {
- const ptrdiff_t ij = ldm*j + i;
- xptr[ij] = (i+j)%2;
+template <typename realvec_t>
+void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm,
+ ptrdiff_t n) {
+ for (ptrdiff_t j = 0; j < n; ++j) {
+ for (ptrdiff_t i = 0; i < m; ++i) {
+ const ptrdiff_t ij = ldm * j + i;
+ xptr[ij] = (i + j) % 2;
}
}
}
-
-
////////////////////////////////////////////////////////////////////////////////
// Evolution loop: Simple stencil example (Gaussian smoothing)
////////////////////////////////////////////////////////////////////////////////
// Introduce a delay, so that cache access is not so important
-template<typename T>
-static T delay(const T x)
-{
+template <typename T> static T delay(const T x) {
return x;
// return log(exp(x));
}
// Original version, unvectorized
-template<typename realvec_t>
+template <typename realvec_t>
void smooth_scalar(typename realvec_t::real_t const *restrict xptr,
- typename realvec_t::real_t *restrict yptr,
- ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+ typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+ ptrdiff_t ldm, ptrdiff_t n) {
typedef typename realvec_t::real_t real_t;
- for (ptrdiff_t j=1; j<n-1; ++j) {
- for (ptrdiff_t i=1; i<m-1; ++i) {
- const ptrdiff_t ij = ldm*j + i;
- const real_t x = xptr[ij];
- const real_t xil = xptr[ij-1];
- const real_t xir = xptr[ij+1];
- const real_t xjl = xptr[ij-ldm];
- const real_t xjr = xptr[ij+ldm];
+ for (ptrdiff_t j = 1; j < n - 1; ++j) {
+ for (ptrdiff_t i = 1; i < m - 1; ++i) {
+ const ptrdiff_t ij = ldm * j + i;
+ const real_t x = xptr[ij];
+ const real_t xil = xptr[ij - 1];
+ const real_t xir = xptr[ij + 1];
+ const real_t xjl = xptr[ij - ldm];
+ const real_t xjr = xptr[ij + ldm];
const real_t y =
- real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
+ real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
yptr[ij] = delay(y);
}
}
}
-
-
// Assuming no particular alignment
-template<typename realvec_t>
+template <typename realvec_t>
void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
- typename realvec_t::real_t *restrict yptr,
- ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+ typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+ ptrdiff_t ldm, ptrdiff_t n) {
typedef typename realvec_t::real_t real_t;
typedef typename realvec_t::mask_t mask_t;
- for (ptrdiff_t j=1; j<n-1; ++j) {
+ for (ptrdiff_t j = 1; j < n - 1; ++j) {
// Desired loop bounds
const ptrdiff_t imin = 1;
- const ptrdiff_t imax = m-1;
+ const ptrdiff_t imax = m - 1;
// Align actual loop iterations with vector size
- const ptrdiff_t ioff = ldm*j;
+ const ptrdiff_t ioff = ldm * j;
for (mask_t mask(imin, imax, ioff); mask; ++mask) {
const ptrdiff_t i = mask.index();
const ptrdiff_t ij = ioff + i;
- const realvec_t x = realvec_t::loadu(xptr+ij);
- const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
- const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
- const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
- const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm);
- const realvec_t y =
- realvec_t(real_t(0.5)) * x +
- realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
- storeu(delay(y), yptr+ij, mask);
+ const realvec_t x = realvec_t::loadu(xptr + ij);
+ const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+ const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+ const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm);
+ const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm);
+ const realvec_t y = realvec_t(real_t(0.5)) * x +
+ realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+ storeu(delay(y), yptr + ij, mask);
}
}
}
-
-
// Assuming that xptr and yptr are aligned, but ldm can be arbitrary
-template<typename realvec_t>
+template <typename realvec_t>
void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
- typename realvec_t::real_t *restrict yptr,
- ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+ typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+ ptrdiff_t ldm, ptrdiff_t n) {
typedef typename realvec_t::real_t real_t;
typedef typename realvec_t::mask_t mask_t;
- for (ptrdiff_t j=1; j<n-1; ++j) {
+ for (ptrdiff_t j = 1; j < n - 1; ++j) {
// Desired loop bounds
const ptrdiff_t imin = 1;
- const ptrdiff_t imax = m-1;
+ const ptrdiff_t imax = m - 1;
// Align actual loop iterations with vector size
- const ptrdiff_t ioff = ldm*j;
+ const ptrdiff_t ioff = ldm * j;
for (mask_t mask(imin, imax, ioff); mask; ++mask) {
const ptrdiff_t i = mask.index();
const ptrdiff_t ij = ioff + i;
- const realvec_t x = realvec_t::loada(xptr+ij);
- const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
- const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
- const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
- const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm);
- const realvec_t y =
- realvec_t(real_t(0.5)) * x +
- realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
- storea(delay(y), yptr+ij, mask);
+ const realvec_t x = realvec_t::loada(xptr + ij);
+ const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+ const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+ const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm);
+ const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm);
+ const realvec_t y = realvec_t(real_t(0.5)) * x +
+ realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+ storea(delay(y), yptr + ij, mask);
}
}
}
-
-
// Assuming that xptr and yptr are aligned, and ldm is a multiple of
// the vector size
-template<typename realvec_t>
+template <typename realvec_t>
void smooth_padded(typename realvec_t::real_t const *restrict xptr,
- typename realvec_t::real_t *restrict yptr,
- ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
-{
+ typename realvec_t::real_t *restrict yptr, ptrdiff_t m,
+ ptrdiff_t ldm, ptrdiff_t n) {
typedef typename realvec_t::real_t real_t;
typedef typename realvec_t::mask_t mask_t;
assert(ldm % realvec_t::size == 0);
- for (ptrdiff_t j=1; j<n-1; ++j) {
+ for (ptrdiff_t j = 1; j < n - 1; ++j) {
// Desired loop bounds
const ptrdiff_t imin = 1;
- const ptrdiff_t imax = m-1;
+ const ptrdiff_t imax = m - 1;
// Align actual loop iterations with vector size
- const ptrdiff_t ioff = ldm*j;
+ const ptrdiff_t ioff = ldm * j;
for (mask_t mask(imin, imax, ioff); mask; ++mask) {
const ptrdiff_t i = mask.index();
const ptrdiff_t ij = ioff + i;
- const realvec_t x = realvec_t::loada(xptr+ij);
- const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
- const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
- const realvec_t xjl = realvec_t::loada(xptr+ij-ldm);
- const realvec_t xjr = realvec_t::loada(xptr+ij+ldm);
- const realvec_t y =
- realvec_t(real_t(0.5)) * x +
- realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
- storea(delay(y), yptr+ij, mask);
+ const realvec_t x = realvec_t::loada(xptr + ij);
+ const realvec_t xil = realvec_t::loadu(xptr + ij, -1);
+ const realvec_t xir = realvec_t::loadu(xptr + ij, +1);
+ const realvec_t xjl = realvec_t::loada(xptr + ij - ldm);
+ const realvec_t xjr = realvec_t::loada(xptr + ij + ldm);
+ const realvec_t y = realvec_t(real_t(0.5)) * x +
+ realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+ storea(delay(y), yptr + ij, mask);
}
}
}
-
-
////////////////////////////////////////////////////////////////////////////////
// Main routine
////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
// Number of iterations
const int niters = 100;
-
+
// Grid size
const ptrdiff_t m = 100;
const ptrdiff_t n = 100;
-
- // Choose a vector size
+
+// Choose a vector size
#if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
- typedef realvec<double,4> realvec_t;
+ typedef realvec<double, 4> realvec_t;
#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
- typedef realvec<double,2> realvec_t;
+ typedef realvec<double, 2> realvec_t;
#else
- typedef realpseudovec<double,1> realvec_t;
+ typedef realpseudovec<double, 1> realvec_t;
#endif
-
+
// Ensure the grid size is aligned
const ptrdiff_t ldm = align_up(m, realvec_t::size);
typedef realvec_t::real_t real_t;
- vector<real_t> x0(ldm*n + realvec_t::size-1), y0(ldm*n + realvec_t::size-1);
- real_t* restrict const x =
- (real_t*)align_up(intptr_t(&x0[0]), sizeof(realvec_t));
- real_t* restrict const y =
- (real_t*)align_up(intptr_t(&y0[0]), sizeof(realvec_t));
- for (ptrdiff_t i=0; i<ldm*n; ++i) y[i] = 0.0;
-
+ vector<real_t> x0(ldm * n + realvec_t::size - 1),
+ y0(ldm * n + realvec_t::size - 1);
+ real_t *restrict const x =
+ (real_t *)align_up(intptr_t(&x0[0]), sizeof(realvec_t));
+ real_t *restrict const y =
+ (real_t *)align_up(intptr_t(&y0[0]), sizeof(realvec_t));
+ for (ptrdiff_t i = 0; i < ldm * n; ++i)
+ y[i] = 0.0;
+
// Initialize
init<realvec_t>(&x[0], m, ldm, n);
-
+
// Timers
ticks t0, t1;
double const cycles_per_tick = 1.0; // measure_tick();
double cycles;
-
+
// Run the different evolution loop versions
t0 = getticks();
- for (int iter=0; iter<niters; ++iter) {
+ for (int iter = 0; iter < niters; ++iter) {
smooth_scalar<realvec_t>(&x[0], &y[0], m, ldm, n);
}
t1 = getticks();
- cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+ cycles =
+ cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
cout << "smooth_scalar: " << cycles << " cycles/point\n";
-
+
t0 = getticks();
- for (int iter=0; iter<niters; ++iter) {
+ for (int iter = 0; iter < niters; ++iter) {
smooth_unaligned<realvec_t>(&x[0], &y[0], m, ldm, n);
}
t1 = getticks();
- cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+ cycles =
+ cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
cout << "smooth_unaligned: " << cycles << " cycles/point\n";
-
+
t0 = getticks();
- for (int iter=0; iter<niters; ++iter) {
+ for (int iter = 0; iter < niters; ++iter) {
smooth_aligned<realvec_t>(&x[0], &y[0], m, ldm, n);
}
t1 = getticks();
- cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+ cycles =
+ cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
cout << "smooth_aligned: " << cycles << " cycles/point\n";
-
+
t0 = getticks();
- for (int iter=0; iter<niters; ++iter) {
+ for (int iter = 0; iter < niters; ++iter) {
smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n);
}
t1 = getticks();
- cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+ cycles =
+ cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters);
cout << "smooth_padded: " << cycles << " cycles/point\n";
-
+
return 0;
}
diff --git a/mathfuncs.h b/mathfuncs.h
index 8d90f9a..9f042d1 100644
--- a/mathfuncs.h
+++ b/mathfuncs.h
@@ -19,4 +19,4 @@
#include "mathfuncs_sinh.h"
#include "mathfuncs_sqrt.h"
-#endif // #ifndef MATHFUNCS_H
+#endif // #ifndef MATHFUNCS_H
diff --git a/mathfuncs_asin.h b/mathfuncs_asin.h
index 3dd9c75..cd174a2 100644
--- a/mathfuncs_asin.h
+++ b/mathfuncs_asin.h
@@ -7,206 +7,181 @@
#include <cmath>
+namespace vecmathlib {
+namespace {
-namespace vecmathlib {
-
-
-
- namespace {
-
- template<typename realvec_t>
- realvec_t mulsign(realvec_t x, realvec_t y)
- {
- typedef typename realvec_t::real_t real_t;
- typedef typename realvec_t::intvec_t intvec_t;
- typedef intvec_t IV;
- typedef floatprops<real_t> FP;
-
- intvec_t value = as_int(x);
- intvec_t sign = as_int(y) & IV(FP::signbit_mask);
- return as_float(value ^ sign);
- }
-
- // Note: the order of arguments is y, x, as is convention for atan2
- template<typename realvec_t>
- realvec_t atan2k(realvec_t y, realvec_t x)
- {
- // Algorithm taken from SLEEF 2.80
-
- typedef typename realvec_t::real_t real_t;
- typedef typename realvec_t::boolvec_t boolvec_t;
- typedef realvec_t RV;
-
- realvec_t q = RV(0.0);
-
- q = ifthen(signbit(x), RV(-2.0), q);
- x = fabs(x);
-
- boolvec_t cond = y > x;
- realvec_t x0 = x;
- realvec_t y0 = y;
- x = ifthen(cond, y0, x0);
- y = ifthen(cond, -x0, y0);
- q += ifthen(cond, RV(1.0), RV(0.0));
-
- realvec_t s = y / x;
- realvec_t t = s * s;
-
- realvec_t u;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- u = RV(0.00282363896258175373077393f);
- u = mad(u, t, RV(-0.0159569028764963150024414f));
- u = mad(u, t, RV(0.0425049886107444763183594f));
- u = mad(u, t, RV(-0.0748900920152664184570312f));
- u = mad(u, t, RV(0.106347933411598205566406f));
- u = mad(u, t, RV(-0.142027363181114196777344f));
- u = mad(u, t, RV(0.199926957488059997558594f));
- u = mad(u, t, RV(-0.333331018686294555664062f));
- break;
- case sizeof(double):
- u = RV(-1.88796008463073496563746e-05);
- u = mad(u, t, RV(0.000209850076645816976906797));
- u = mad(u, t, RV(-0.00110611831486672482563471));
- u = mad(u, t, RV(0.00370026744188713119232403));
- u = mad(u, t, RV(-0.00889896195887655491740809));
- u = mad(u, t, RV(0.016599329773529201970117));
- u = mad(u, t, RV(-0.0254517624932312641616861));
- u = mad(u, t, RV(0.0337852580001353069993897));
- u = mad(u, t, RV(-0.0407629191276836500001934));
- u = mad(u, t, RV(0.0466667150077840625632675));
- u = mad(u, t, RV(-0.0523674852303482457616113));
- u = mad(u, t, RV(0.0587666392926673580854313));
- u = mad(u, t, RV(-0.0666573579361080525984562));
- u = mad(u, t, RV(0.0769219538311769618355029));
- u = mad(u, t, RV(-0.090908995008245008229153));
- u = mad(u, t, RV(0.111111105648261418443745));
- u = mad(u, t, RV(-0.14285714266771329383765));
- u = mad(u, t, RV(0.199999999996591265594148));
- u = mad(u, t, RV(-0.333333333333311110369124));
- break;
- }
-
- t = mad(u, t * s, s);
- t = mad(q, RV(M_PI_2), t);
-
- return t;
- }
-
- }
-
+template <typename realvec_t> realvec_t mulsign(realvec_t x, realvec_t y) {
+ typedef typename realvec_t::real_t real_t;
+ typedef typename realvec_t::intvec_t intvec_t;
+ typedef intvec_t IV;
+ typedef floatprops<real_t> FP;
+ intvec_t value = as_int(x);
+ intvec_t sign = as_int(y) & IV(FP::signbit_mask);
+ return as_float(value ^ sign);
+}
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d)
- {
- // Algorithm taken from SLEEF 2.80
- return mulsign(atan2k(fabs(d), sqrt((RV(1.0)+d)*(RV(1.0)-d))), d);
- }
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d)
- {
- // Algorithm taken from SLEEF 2.80
- return (mulsign(atan2k(sqrt((RV(1.0)+d)*(RV(1.0)-d)), fabs(d)), d) +
- ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
- }
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s)
- {
- // Algorithm taken from SLEEF 2.80
-
- realvec_t q1 = s;
- s = fabs(s);
-
- boolvec_t q0 = s > RV(1.0);
- s = ifthen(q0, rcp(s), s);
-
- realvec_t t = s * s;
-
- realvec_t u;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- u = RV(0.00282363896258175373077393f);
- u = mad(u, t, RV(-0.0159569028764963150024414f));
- u = mad(u, t, RV(0.0425049886107444763183594f));
- u = mad(u, t, RV(-0.0748900920152664184570312f));
- u = mad(u, t, RV(0.106347933411598205566406f));
- u = mad(u, t, RV(-0.142027363181114196777344f));
- u = mad(u, t, RV(0.199926957488059997558594f));
- u = mad(u, t, RV(-0.333331018686294555664062f));
- break;
- case sizeof(double):
- u = RV(-1.88796008463073496563746e-05);
- u = mad(u, t, RV(0.000209850076645816976906797));
- u = mad(u, t, RV(-0.00110611831486672482563471));
- u = mad(u, t, RV(0.00370026744188713119232403));
- u = mad(u, t, RV(-0.00889896195887655491740809));
- u = mad(u, t, RV(0.016599329773529201970117));
- u = mad(u, t, RV(-0.0254517624932312641616861));
- u = mad(u, t, RV(0.0337852580001353069993897));
- u = mad(u, t, RV(-0.0407629191276836500001934));
- u = mad(u, t, RV(0.0466667150077840625632675));
- u = mad(u, t, RV(-0.0523674852303482457616113));
- u = mad(u, t, RV(0.0587666392926673580854313));
- u = mad(u, t, RV(-0.0666573579361080525984562));
- u = mad(u, t, RV(0.0769219538311769618355029));
- u = mad(u, t, RV(-0.090908995008245008229153));
- u = mad(u, t, RV(0.111111105648261418443745));
- u = mad(u, t, RV(-0.14285714266771329383765));
- u = mad(u, t, RV(0.199999999996591265594148));
- u = mad(u, t, RV(-0.333333333333311110369124));
- break;
- }
-
- t = s + s * (t * u);
-
- t = ifthen(q0, RV(M_PI_2) - t, t);
- t = copysign(t, q1);
-
- return t;
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t> realvec_t atan2k(realvec_t y, realvec_t x) {
+ // Algorithm taken from SLEEF 2.80
+
+ typedef typename realvec_t::real_t real_t;
+ typedef typename realvec_t::boolvec_t boolvec_t;
+ typedef realvec_t RV;
+
+ realvec_t q = RV(0.0);
+
+ q = ifthen(signbit(x), RV(-2.0), q);
+ x = fabs(x);
+
+ boolvec_t cond = y > x;
+ realvec_t x0 = x;
+ realvec_t y0 = y;
+ x = ifthen(cond, y0, x0);
+ y = ifthen(cond, -x0, y0);
+ q += ifthen(cond, RV(1.0), RV(0.0));
+
+ realvec_t s = y / x;
+ realvec_t t = s * s;
+
+ realvec_t u;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ u = RV(0.00282363896258175373077393f);
+ u = mad(u, t, RV(-0.0159569028764963150024414f));
+ u = mad(u, t, RV(0.0425049886107444763183594f));
+ u = mad(u, t, RV(-0.0748900920152664184570312f));
+ u = mad(u, t, RV(0.106347933411598205566406f));
+ u = mad(u, t, RV(-0.142027363181114196777344f));
+ u = mad(u, t, RV(0.199926957488059997558594f));
+ u = mad(u, t, RV(-0.333331018686294555664062f));
+ break;
+ case sizeof(double):
+ u = RV(-1.88796008463073496563746e-05);
+ u = mad(u, t, RV(0.000209850076645816976906797));
+ u = mad(u, t, RV(-0.00110611831486672482563471));
+ u = mad(u, t, RV(0.00370026744188713119232403));
+ u = mad(u, t, RV(-0.00889896195887655491740809));
+ u = mad(u, t, RV(0.016599329773529201970117));
+ u = mad(u, t, RV(-0.0254517624932312641616861));
+ u = mad(u, t, RV(0.0337852580001353069993897));
+ u = mad(u, t, RV(-0.0407629191276836500001934));
+ u = mad(u, t, RV(0.0466667150077840625632675));
+ u = mad(u, t, RV(-0.0523674852303482457616113));
+ u = mad(u, t, RV(0.0587666392926673580854313));
+ u = mad(u, t, RV(-0.0666573579361080525984562));
+ u = mad(u, t, RV(0.0769219538311769618355029));
+ u = mad(u, t, RV(-0.090908995008245008229153));
+ u = mad(u, t, RV(0.111111105648261418443745));
+ u = mad(u, t, RV(-0.14285714266771329383765));
+ u = mad(u, t, RV(0.199999999996591265594148));
+ u = mad(u, t, RV(-0.333333333333311110369124));
+ break;
}
-
-
- // Note: the order of arguments is y, x, as is convention for atan2
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x)
- {
- // Algorithm taken from SLEEF 2.80
-
- realvec_t r = atan2k(fabs(y), x);
-
- r = mulsign(r, x);
-
- r = ifthen(isinf(x) || x == RV(0.0),
- ifthen(isinf(x),
- RV(M_PI_2) - copysign(RV(M_PI_2), x),
- RV(M_PI_2)),
- r);
-
- r = ifthen(isinf(y),
- ifthen(isinf(x),
- RV(M_PI_2) - copysign(RV(M_PI_4), x),
- RV(M_PI_2)),
- r);
-
- r = ifthen(y == RV(0.0),
- ifthen(signbit(x), RV(M_PI), RV(0.0)),
- r);
-
- const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
- return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+ t = mad(u, t * s, s);
+ t = mad(q, RV(M_PI_2), t);
+
+ return t;
+}
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) {
+ // Algorithm taken from SLEEF 2.80
+ return mulsign(atan2k(fabs(d), sqrt((RV(1.0) + d) * (RV(1.0) - d))), d);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) {
+ // Algorithm taken from SLEEF 2.80
+ return (mulsign(atan2k(sqrt((RV(1.0) + d) * (RV(1.0) - d)), fabs(d)), d) +
+ ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s) {
+ // Algorithm taken from SLEEF 2.80
+
+ realvec_t q1 = s;
+ s = fabs(s);
+
+ boolvec_t q0 = s > RV(1.0);
+ s = ifthen(q0, rcp(s), s);
+
+ realvec_t t = s * s;
+
+ realvec_t u;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ u = RV(0.00282363896258175373077393f);
+ u = mad(u, t, RV(-0.0159569028764963150024414f));
+ u = mad(u, t, RV(0.0425049886107444763183594f));
+ u = mad(u, t, RV(-0.0748900920152664184570312f));
+ u = mad(u, t, RV(0.106347933411598205566406f));
+ u = mad(u, t, RV(-0.142027363181114196777344f));
+ u = mad(u, t, RV(0.199926957488059997558594f));
+ u = mad(u, t, RV(-0.333331018686294555664062f));
+ break;
+ case sizeof(double):
+ u = RV(-1.88796008463073496563746e-05);
+ u = mad(u, t, RV(0.000209850076645816976906797));
+ u = mad(u, t, RV(-0.00110611831486672482563471));
+ u = mad(u, t, RV(0.00370026744188713119232403));
+ u = mad(u, t, RV(-0.00889896195887655491740809));
+ u = mad(u, t, RV(0.016599329773529201970117));
+ u = mad(u, t, RV(-0.0254517624932312641616861));
+ u = mad(u, t, RV(0.0337852580001353069993897));
+ u = mad(u, t, RV(-0.0407629191276836500001934));
+ u = mad(u, t, RV(0.0466667150077840625632675));
+ u = mad(u, t, RV(-0.0523674852303482457616113));
+ u = mad(u, t, RV(0.0587666392926673580854313));
+ u = mad(u, t, RV(-0.0666573579361080525984562));
+ u = mad(u, t, RV(0.0769219538311769618355029));
+ u = mad(u, t, RV(-0.090908995008245008229153));
+ u = mad(u, t, RV(0.111111105648261418443745));
+ u = mad(u, t, RV(-0.14285714266771329383765));
+ u = mad(u, t, RV(0.199999999996591265594148));
+ u = mad(u, t, RV(-0.333333333333311110369124));
+ break;
}
-
+
+ t = s + s * (t * u);
+
+ t = ifthen(q0, RV(M_PI_2) - t, t);
+ t = copysign(t, q1);
+
+ return t;
+}
+
+// Note: the order of arguments is y, x, as is convention for atan2
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x) {
+ // Algorithm taken from SLEEF 2.80
+
+ realvec_t r = atan2k(fabs(y), x);
+
+ r = mulsign(r, x);
+
+ r = ifthen(isinf(x) || x == RV(0.0),
+ ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_2), x), RV(M_PI_2)),
+ r);
+
+ r = ifthen(isinf(y),
+ ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_4), x), RV(M_PI_2)),
+ r);
+
+ r = ifthen(y == RV(0.0), ifthen(signbit(x), RV(M_PI), RV(0.0)), r);
+
+ const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+ return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y));
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/mathfuncs_asinh.h b/mathfuncs_asinh.h
index c7be8eb..1197261 100644
--- a/mathfuncs_asinh.h
+++ b/mathfuncs_asinh.h
@@ -7,36 +7,31 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x) {
+ // Reduce range
+ realvec_t r = fabs(x);
+ r = log(r + sqrt(r * r + RV(1.0)));
+ r = copysign(r, x);
+ return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x) {
+ return log(x + sqrt(x * x - RV(1.0)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x) {
+ // Reduce range
+ realvec_t r = fabs(x);
+ r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r));
+ r = copysign(r, x);
+ return r;
+}
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x)
- {
- // Reduce range
- realvec_t r = fabs(x);
- r = log(r + sqrt(r*r + RV(1.0)));
- r = copysign(r, x);
- return r;
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x)
- {
- return log(x + sqrt(x*x - RV(1.0)));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x)
- {
- // Reduce range
- realvec_t r = fabs(x);
- r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r));
- r = copysign(r, x);
- return r;
- }
-
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_ASINH_H
+#endif // #ifndef MATHFUNCS_ASINH_H
diff --git a/mathfuncs_base.h b/mathfuncs_base.h
index c685542..8545003 100644
--- a/mathfuncs_base.h
+++ b/mathfuncs_base.h
@@ -5,130 +5,127 @@
#include "floatprops.h"
+namespace vecmathlib {
+template <typename realvec_t> struct mathfuncs {
+ typedef floatprops<typename realvec_t::real_t> FP;
+
+ typedef typename FP::real_t real_t;
+ typedef typename FP::int_t int_t;
+ typedef typename FP::uint_t uint_t;
+
+ static int const size = realvec_t::size;
+
+ // typedef realvec<real_t, size> realvec_t;
+ typedef typename realvec_t::intvec_t intvec_t;
+ typedef typename realvec_t::boolvec_t boolvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ // static real_t R(double a) { return real_t(a); }
+ // static int_t I(int a) { return int_t(a); }
+ // static uint_t U(int a) { return uint_t(a); }
+ // static realvec_t RV(real_t a) { return realvec_t(a); }
+ // static intvec_t IV(int_t a) { return intvec_t(a); }
+ // static boolvec_t BV(bool a) { return boolvec_t(a); }
+
+ // int
+ static intvec_t vml_abs(intvec_t x);
+ static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
+ static intvec_t vml_clz(intvec_t x);
+ static boolvec_t vml_isignbit(intvec_t x);
+ static intvec_t vml_max(intvec_t x, intvec_t y);
+ static intvec_t vml_min(intvec_t x, intvec_t y);
+ static intvec_t vml_popcount(intvec_t x);
+ static intvec_t vml_rotate(intvec_t x, int_t n);
+ static intvec_t vml_rotate(intvec_t x, intvec_t n);
+
+ // asin
+ static realvec_t vml_acos(realvec_t x);
+ static realvec_t vml_asin(realvec_t x);
+ static realvec_t vml_atan(realvec_t x);
+ static realvec_t vml_atan2(realvec_t y, realvec_t x);
+
+ // asinh
+ static realvec_t vml_acosh(realvec_t x);
+ static realvec_t vml_asinh(realvec_t x);
+ static realvec_t vml_atanh(realvec_t x);
+
+ // convert
+ static realvec_t vml_antitrunc(realvec_t x);
+ static realvec_t vml_ceil(realvec_t x);
+ static realvec_t vml_convert_float(intvec_t x);
+ static intvec_t vml_convert_int(realvec_t x);
+ static realvec_t vml_floor(realvec_t x);
+ static intvec_t vml_lrint(realvec_t x);
+ static realvec_t vml_rint(realvec_t x);
+ static realvec_t vml_round(realvec_t x);
+ static realvec_t vml_nextafter(realvec_t x, realvec_t y);
+ static realvec_t vml_trunc(realvec_t x);
+
+ // fabs
+ static realvec_t vml_copysign(realvec_t x, realvec_t y);
+ static realvec_t vml_fabs(realvec_t x);
+ static realvec_t vml_fdim(realvec_t x, realvec_t y);
+ static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
+ static realvec_t vml_fmax(realvec_t x, realvec_t y);
+ static realvec_t vml_fmin(realvec_t x, realvec_t y);
+ static realvec_t vml_frexp(realvec_t x, intvec_t *r);
+ static intvec_t vml_ilogb(realvec_t x);
+ static boolvec_t vml_ieee_isfinite(realvec_t x);
+ static boolvec_t vml_ieee_isinf(realvec_t x);
+ static boolvec_t vml_ieee_isnan(realvec_t x);
+ static boolvec_t vml_ieee_isnormal(realvec_t x);
+ static boolvec_t vml_isfinite(realvec_t x);
+ static boolvec_t vml_isinf(realvec_t x);
+ static boolvec_t vml_isnan(realvec_t x);
+ static boolvec_t vml_isnormal(realvec_t x);
+ static realvec_t vml_ldexp(realvec_t x, intvec_t n);
+ static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
+ static boolvec_t vml_signbit(realvec_t x);
+
+ // exp
+ static realvec_t vml_exp(realvec_t x);
+ static realvec_t vml_exp10(realvec_t x);
+ static realvec_t vml_exp2(realvec_t x);
+ static realvec_t vml_expm1(realvec_t x);
+
+ // log
+ static realvec_t vml_log(realvec_t x);
+ static realvec_t vml_log10(realvec_t x);
+ static realvec_t vml_log1p(realvec_t x);
+ static realvec_t vml_log2(realvec_t x);
+
+ // pow
+ static realvec_t vml_pow(realvec_t x, realvec_t y);
+
+ // rcp
+ static realvec_t vml_fmod(realvec_t x, realvec_t y);
+ static realvec_t vml_rcp(realvec_t x);
+ static realvec_t vml_remainder(realvec_t x, realvec_t y);
+
+ // sin
+ static realvec_t vml_cos(realvec_t x);
+ static realvec_t vml_sin(realvec_t x);
+ static realvec_t vml_tan(realvec_t x);
+
+ // sinh
+ static realvec_t vml_cosh(realvec_t x);
+ static realvec_t vml_sinh(realvec_t x);
+ static realvec_t vml_tanh(realvec_t x);
+
+ // sqrt
+ static realvec_t vml_cbrt(realvec_t x);
+ static realvec_t vml_hypot(realvec_t x, realvec_t y);
+ static realvec_t vml_rsqrt(realvec_t x);
+ static realvec_t vml_sqrt(realvec_t x);
+};
-namespace vecmathlib {
-
- template<typename realvec_t>
- struct mathfuncs {
- typedef floatprops<typename realvec_t::real_t> FP;
-
- typedef typename FP::real_t real_t;
- typedef typename FP::int_t int_t;
- typedef typename FP::uint_t uint_t;
-
- static int const size = realvec_t::size;
-
- // typedef realvec<real_t, size> realvec_t;
- typedef typename realvec_t::intvec_t intvec_t;
- typedef typename realvec_t::boolvec_t boolvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- // static real_t R(double a) { return real_t(a); }
- // static int_t I(int a) { return int_t(a); }
- // static uint_t U(int a) { return uint_t(a); }
- // static realvec_t RV(real_t a) { return realvec_t(a); }
- // static intvec_t IV(int_t a) { return intvec_t(a); }
- // static boolvec_t BV(bool a) { return boolvec_t(a); }
-
- // int
- static intvec_t vml_abs(intvec_t x);
- static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z);
- static intvec_t vml_clz(intvec_t x);
- static boolvec_t vml_isignbit(intvec_t x);
- static intvec_t vml_max(intvec_t x, intvec_t y);
- static intvec_t vml_min(intvec_t x, intvec_t y);
- static intvec_t vml_popcount(intvec_t x);
- static intvec_t vml_rotate(intvec_t x, int_t n);
- static intvec_t vml_rotate(intvec_t x, intvec_t n);
-
- // asin
- static realvec_t vml_acos(realvec_t x);
- static realvec_t vml_asin(realvec_t x);
- static realvec_t vml_atan(realvec_t x);
- static realvec_t vml_atan2(realvec_t y, realvec_t x);
-
- // asinh
- static realvec_t vml_acosh(realvec_t x);
- static realvec_t vml_asinh(realvec_t x);
- static realvec_t vml_atanh(realvec_t x);
-
- // convert
- static realvec_t vml_antitrunc(realvec_t x);
- static realvec_t vml_ceil(realvec_t x);
- static realvec_t vml_convert_float(intvec_t x);
- static intvec_t vml_convert_int(realvec_t x);
- static realvec_t vml_floor(realvec_t x);
- static intvec_t vml_lrint(realvec_t x);
- static realvec_t vml_rint(realvec_t x);
- static realvec_t vml_round(realvec_t x);
- static realvec_t vml_nextafter(realvec_t x, realvec_t y);
- static realvec_t vml_trunc(realvec_t x);
-
- // fabs
- static realvec_t vml_copysign(realvec_t x, realvec_t y);
- static realvec_t vml_fabs(realvec_t x);
- static realvec_t vml_fdim(realvec_t x, realvec_t y);
- static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z);
- static realvec_t vml_fmax(realvec_t x, realvec_t y);
- static realvec_t vml_fmin(realvec_t x, realvec_t y);
- static realvec_t vml_frexp(realvec_t x, intvec_t* r);
- static intvec_t vml_ilogb(realvec_t x);
- static boolvec_t vml_ieee_isfinite(realvec_t x);
- static boolvec_t vml_ieee_isinf(realvec_t x);
- static boolvec_t vml_ieee_isnan(realvec_t x);
- static boolvec_t vml_ieee_isnormal(realvec_t x);
- static boolvec_t vml_isfinite(realvec_t x);
- static boolvec_t vml_isinf(realvec_t x);
- static boolvec_t vml_isnan(realvec_t x);
- static boolvec_t vml_isnormal(realvec_t x);
- static realvec_t vml_ldexp(realvec_t x, intvec_t n);
- static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z);
- static boolvec_t vml_signbit(realvec_t x);
-
- // exp
- static realvec_t vml_exp(realvec_t x);
- static realvec_t vml_exp10(realvec_t x);
- static realvec_t vml_exp2(realvec_t x);
- static realvec_t vml_expm1(realvec_t x);
-
- // log
- static realvec_t vml_log(realvec_t x);
- static realvec_t vml_log10(realvec_t x);
- static realvec_t vml_log1p(realvec_t x);
- static realvec_t vml_log2(realvec_t x);
-
- // pow
- static realvec_t vml_pow(realvec_t x, realvec_t y);
-
- // rcp
- static realvec_t vml_fmod(realvec_t x, realvec_t y);
- static realvec_t vml_rcp(realvec_t x);
- static realvec_t vml_remainder(realvec_t x, realvec_t y);
-
- // sin
- static realvec_t vml_cos(realvec_t x);
- static realvec_t vml_sin(realvec_t x);
- static realvec_t vml_tan(realvec_t x);
-
- // sinh
- static realvec_t vml_cosh(realvec_t x);
- static realvec_t vml_sinh(realvec_t x);
- static realvec_t vml_tanh(realvec_t x);
-
- // sqrt
- static realvec_t vml_cbrt(realvec_t x);
- static realvec_t vml_hypot(realvec_t x, realvec_t y);
- static realvec_t vml_rsqrt(realvec_t x);
- static realvec_t vml_sqrt(realvec_t x);
- };
-
} // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_BASE_H
+#endif // #ifndef MATHFUNCS_BASE_H
diff --git a/mathfuncs_convert.h b/mathfuncs_convert.h
index 79befbc..9cb1add 100644
--- a/mathfuncs_convert.h
+++ b/mathfuncs_convert.h
@@ -7,197 +7,179 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x) {
+ // Convert in two passes. Convert as much as possible during the
+ // first pass (lobits), so that the second pass (hibits) may be
+ // omitted if the high bits are known to be zero.
+ int_t lobits = FP::mantissa_bits;
+ // int_t hibits = FP::bits - lobits;
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x)
- {
- // Convert in two passes. Convert as much as possible during the
- // first pass (lobits), so that the second pass (hibits) may be
- // omitted if the high bits are known to be zero.
- int_t lobits = FP::mantissa_bits;
- // int_t hibits = FP::bits - lobits;
-
- // Convert lower bits
- intvec_t xlo = x & IV((U(1) << lobits) - 1);
- // exponent for the equivalent floating point number
- int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
- xlo |= exponent_lo;
- // subtract hidden mantissa bit
- realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
-
- // Convert upper bits
- // make unsigned by subtracting largest negative number
- // (only do this for the high bits, since they have sufficient
- // precision to handle the overflow)
- x ^= FP::signbit_mask;
- intvec_t xhi = lsr(x, lobits);
- // exponent for the equivalent floating point number
- int_t exponent_hi = (FP::exponent_offset + 2*lobits) << FP::mantissa_bits;
- xhi |= exponent_hi;
- // subtract hidden mantissa bit
- realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
- // add largest negative number again
- fhi -= RV(R(FP::signbit_mask));
- // Ensure that the converted low and high bits are calculated
- // separately, since a real_t doesn't have enough precision to
- // hold all the bits of an int_t
- fhi.barrier();
-
- // Combine results
- return flo + fhi;
- }
-
-
-
- template<typename realvec_t>
- typename realvec_t::intvec_t
- mathfuncs<realvec_t>::vml_convert_int(realvec_t x)
- {
- // Handle overflow
- // int_t min_int = FP::signbit_mask;
- // int_t max_int = ~FP::signbit_mask;
- // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
- // Handle negative numbers
- boolvec_t is_negative = signbit(x);
- x = fabs(x);
- // Handle small numbers
- boolvec_t issmall = x < RV(1.0);
-
- intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
- boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits));
- intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
- // add hidden mantissa bit
- ix |= U(1) << FP::mantissa_bits;
- // shift according to exponent (which may truncate)
- ix = ifthen(shift_left, ix << shift, ix >> -shift);
-
- // Handle small numbers
- ix = ifthen(issmall, IV(I(0)), ix);
- // Handle negative numbers
- ix = ifthen(is_negative, -ix, ix);
- // Handle overflow
- // ix = ifthen(is_overflow, IV(min_int), ix);
-
- return ix;
- }
-
-
-
- // Round to nearest integer, breaking ties using prevailing rounding
- // mode (default: round to even)
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x)
- {
- realvec_t r = x;
- // Round by adding a large number, destroying all excess precision
- realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
- r += offset;
- // Ensure the rounding is not optimised away
- r.barrier();
- r -= offset;
- return r;
- }
-
- // Round to next integer above
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x)
- {
- // boolvec_t iszero = x == RV(0.0);
- // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
- // return ifthen(iszero, x, rint(x + offset));
- return ifthen(x<RV(0.0), trunc(x), vml_antitrunc(x));
- }
-
- // Round to next integer below
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x)
- {
- // boolvec_t iszero = x == RV(0.0);
- // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
- // return ifthen(iszero, x, rint(x - offset));
- return ifthen(x<RV(0.0), vml_antitrunc(x), trunc(x));
- }
-
- // Round to nearest integer, breaking ties using prevailing rounding
- // mode (default: round to even), returning an integer
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x)
- {
- return convert_int(rint(x));
- }
-
- // Round to nearest integer, breaking ties away from zero
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x)
- {
- // return copysign(floor(fabs(x)+RV(0.5)), x);
- return trunc(x + copysign(RV(0.5), x));
- }
-
- // Round to next integer towards zero
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x)
- {
- realvec_t x0 = x;
- x = fabs(x);
- boolvec_t istoosmall = x < RV(1.0);
- boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
- // Number of mantissa bits to keep
- intvec_t nbits = ilogb(x);
- // This is probably faster than a shift operation
- realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
- intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
- realvec_t y = as_float(as_int(x) & imask);
- realvec_t r =
+ // Convert lower bits
+ intvec_t xlo = x & IV((U(1) << lobits) - 1);
+ // exponent for the equivalent floating point number
+ int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits;
+ xlo |= exponent_lo;
+ // subtract hidden mantissa bit
+ realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo));
+
+ // Convert upper bits
+ // make unsigned by subtracting largest negative number
+ // (only do this for the high bits, since they have sufficient
+ // precision to handle the overflow)
+ x ^= FP::signbit_mask;
+ intvec_t xhi = lsr(x, lobits);
+ // exponent for the equivalent floating point number
+ int_t exponent_hi = (FP::exponent_offset + 2 * lobits) << FP::mantissa_bits;
+ xhi |= exponent_hi;
+ // subtract hidden mantissa bit
+ realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi));
+ // add largest negative number again
+ fhi -= RV(R(FP::signbit_mask));
+ // Ensure that the converted low and high bits are calculated
+ // separately, since a real_t doesn't have enough precision to
+ // hold all the bits of an int_t
+ fhi.barrier();
+
+ // Combine results
+ return flo + fhi;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_convert_int(realvec_t x) {
+ // Handle overflow
+ // int_t min_int = FP::signbit_mask;
+ // int_t max_int = ~FP::signbit_mask;
+ // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int));
+ // Handle negative numbers
+ boolvec_t is_negative = signbit(x);
+ x = fabs(x);
+ // Handle small numbers
+ boolvec_t issmall = x < RV(1.0);
+
+ intvec_t shift = ilogb(x) - IV(FP::mantissa_bits);
+ boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits));
+ intvec_t ix = as_int(x) & IV(FP::mantissa_mask);
+ // add hidden mantissa bit
+ ix |= U(1) << FP::mantissa_bits;
+ // shift according to exponent (which may truncate)
+ ix = ifthen(shift_left, ix << shift, ix >> -shift);
+
+ // Handle small numbers
+ ix = ifthen(issmall, IV(I(0)), ix);
+ // Handle negative numbers
+ ix = ifthen(is_negative, -ix, ix);
+ // Handle overflow
+ // ix = ifthen(is_overflow, IV(min_int), ix);
+
+ return ix;
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even)
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x) {
+ realvec_t r = x;
+ // Round by adding a large number, destroying all excess precision
+ realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x);
+ r += offset;
+ // Ensure the rounding is not optimised away
+ r.barrier();
+ r -= offset;
+ return r;
+}
+
+// Round to next integer above
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x) {
+ // boolvec_t iszero = x == RV(0.0);
+ // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+ // return ifthen(iszero, x, rint(x + offset));
+ return ifthen(x < RV(0.0), trunc(x), vml_antitrunc(x));
+}
+
+// Round to next integer below
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x) {
+ // boolvec_t iszero = x == RV(0.0);
+ // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits));
+ // return ifthen(iszero, x, rint(x - offset));
+ return ifthen(x < RV(0.0), vml_antitrunc(x), trunc(x));
+}
+
+// Round to nearest integer, breaking ties using prevailing rounding
+// mode (default: round to even), returning an integer
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x) {
+ return convert_int(rint(x));
+}
+
+// Round to nearest integer, breaking ties away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x) {
+ // return copysign(floor(fabs(x)+RV(0.5)), x);
+ return trunc(x + copysign(RV(0.5), x));
+}
+
+// Round to next integer towards zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x) {
+ realvec_t x0 = x;
+ x = fabs(x);
+ boolvec_t istoosmall = x < RV(1.0);
+ boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits));
+ // Number of mantissa bits to keep
+ intvec_t nbits = ilogb(x);
+ // This is probably faster than a shift operation
+ realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+ intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+ realvec_t y = as_float(as_int(x) & imask);
+ realvec_t r =
copysign(ifthen(istoosmall, RV(0.0), ifthen(istoolarge, x, y)), x0);
- return r;
- }
-
- // Round to next integer away from zero
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x)
- {
- realvec_t x0 = x;
- x = fabs(x);
- boolvec_t iszero = x == RV(0.0);
- boolvec_t issmall = x <= RV(1.0);
- boolvec_t istoolarge =
- x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
- // Number of mantissa bits to keep
- intvec_t nbits = ilogb(x);
- // This is probably faster than a shift operation
- realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
- intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
- realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
- offset.barrier();
- realvec_t y = as_float(as_int(x + offset) & imask);
- realvec_t r =
+ return r;
+}
+
+// Round to next integer away from zero
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x) {
+ realvec_t x0 = x;
+ x = fabs(x);
+ boolvec_t iszero = x == RV(0.0);
+ boolvec_t issmall = x <= RV(1.0);
+ boolvec_t istoolarge = x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0));
+ // Number of mantissa bits to keep
+ intvec_t nbits = ilogb(x);
+ // This is probably faster than a shift operation
+ realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0);
+ intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask);
+ realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits));
+ offset.barrier();
+ realvec_t y = as_float(as_int(x + offset) & imask);
+ realvec_t r =
copysign(ifthen(iszero, RV(0.0),
- ifthen(issmall, RV(1.0),
- ifthen(istoolarge, x, y))), x0);
- return r;
- }
-
- // Next machine representable number from x in direction y
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y)
- {
- realvec_t dir = y - x;
- realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
- offset = copysign(offset, dir);
- offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
- signbit(x) == signbit(offset),
- offset,
- offset * RV(0.5));
- realvec_t r = x + offset;
- real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
- return ifthen(dir==RV(0.0), y,
- ifthen(x==RV(0.0), copysign(RV(smallest_pos), dir), r));
- }
-
+ ifthen(issmall, RV(1.0), ifthen(istoolarge, x, y))),
+ x0);
+ return r;
+}
+
+// Next machine representable number from x in direction y
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y) {
+ realvec_t dir = y - x;
+ realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x));
+ offset = copysign(offset, dir);
+ offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) ||
+ signbit(x) == signbit(offset),
+ offset, offset * RV(0.5));
+ realvec_t r = x + offset;
+ real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits);
+ return ifthen(dir == RV(0.0), y,
+ ifthen(x == RV(0.0), copysign(RV(smallest_pos), dir), r));
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_CONVERT_H
+#endif // #ifndef MATHFUNCS_CONVERT_H
diff --git a/mathfuncs_exp.h b/mathfuncs_exp.h
index d357a21..e35fb1b 100644
--- a/mathfuncs_exp.h
+++ b/mathfuncs_exp.h
@@ -7,156 +7,145 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x) {
+ // TODO: Check SLEEF 2.80 algorithm
+ // (in particular the improved-precision truncation)
+
+ // Rescale
+ realvec_t x0 = x;
+
+// realvec_t round_x = rint(x);
+// intvec_t iround_x = convert_int(round_x);
+// r = ldexp(r, iround_x);
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x)
- {
- // TODO: Check SLEEF 2.80 algorithm
- // (in particular the improved-precision truncation)
-
- // Rescale
- realvec_t x0 = x;
-
- // realvec_t round_x = rint(x);
- // intvec_t iround_x = convert_int(round_x);
- // r = ldexp(r, iround_x);
-
#if 0
// Straightforward implementation
realvec_t round_x = rint(x);
x -= round_x;
#elif 1
- // Round by adding, then subtracting again a large number
- // Add a large number to move the mantissa bits to the right
- int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
- realvec_t tmp = x + RV(R(large));
- tmp.barrier();
-
- realvec_t round_x = tmp - RV(R(large));
- x -= round_x;
+ // Round by adding, then subtracting again a large number
+ // Add a large number to move the mantissa bits to the right
+ int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset;
+ realvec_t tmp = x + RV(R(large));
+ tmp.barrier();
+
+ realvec_t round_x = tmp - RV(R(large));
+ x -= round_x;
#else
- // Straightforward implementation, using round instead of rint,
- // since round is faster for QPX
- realvec_t round_x = round(x);
- x -= round_x;
+ // Straightforward implementation, using round instead of rint,
+ // since round is faster for QPX
+ realvec_t round_x = round(x);
+ x -= round_x;
#endif
- VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
-
- // Polynomial expansion
- realvec_t r;
- switch (sizeof(real_t)) {
- case 4:
+ VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5)));
+
+ // Polynomial expansion
+ realvec_t r;
+ switch (sizeof(real_t)) {
+ case 4:
#ifdef VML_HAVE_FP_CONTRACT
- // float, error=4.55549108005200277750378992345e-9
- r = RV(0.000154653240842602623787395880898);
- r = mad(r, x, RV(0.00133952915439234389712105060319));
- r = mad(r, x, RV(0.0096180399118156827664944870552));
- r = mad(r, x, RV(0.055503406540531310853149866446));
- r = mad(r, x, RV(0.240226511015459465468737123346));
- r = mad(r, x, RV(0.69314720007380208630542805293));
- r = mad(r, x, RV(0.99999999997182023878745628977));
+ // float, error=4.55549108005200277750378992345e-9
+ r = RV(0.000154653240842602623787395880898);
+ r = mad(r, x, RV(0.00133952915439234389712105060319));
+ r = mad(r, x, RV(0.0096180399118156827664944870552));
+ r = mad(r, x, RV(0.055503406540531310853149866446));
+ r = mad(r, x, RV(0.240226511015459465468737123346));
+ r = mad(r, x, RV(0.69314720007380208630542805293));
+ r = mad(r, x, RV(0.99999999997182023878745628977));
#else
- // float, error=1.62772721960621336664735896836e-7
- r = RV(0.00133952915439234389712105060319);
- r = mad(r, x, RV(0.009670773148229417605024318985));
- r = mad(r, x, RV(0.055503406540531310853149866446));
- r = mad(r, x, RV(0.240222115700585316818177639177));
- r = mad(r, x, RV(0.69314720007380208630542805293));
- r = mad(r, x, RV(1.00000005230745711373079206024));
+ // float, error=1.62772721960621336664735896836e-7
+ r = RV(0.00133952915439234389712105060319);
+ r = mad(r, x, RV(0.009670773148229417605024318985));
+ r = mad(r, x, RV(0.055503406540531310853149866446));
+ r = mad(r, x, RV(0.240222115700585316818177639177));
+ r = mad(r, x, RV(0.69314720007380208630542805293));
+ r = mad(r, x, RV(1.00000005230745711373079206024));
#endif
- break;
- case 8:
+ break;
+ case 8:
#ifdef VML_HAVE_FP_CONTRACT
- // double, error=9.32016781355638010975628074746e-18
- r = RV(4.45623165388261696886670014471e-10);
- r = mad(r, x, RV(7.0733589360775271430968224806e-9));
- r = mad(r, x, RV(1.01780540270960163558119510246e-7));
- r = mad(r, x, RV(1.3215437348041505269462510712e-6));
- r = mad(r, x, RV(0.000015252733849766201174247690629));
- r = mad(r, x, RV(0.000154035304541242555115696403795));
- r = mad(r, x, RV(0.00133335581463968601407096905671));
- r = mad(r, x, RV(0.0096181291075949686712855561931));
- r = mad(r, x, RV(0.055504108664821672870565883052));
- r = mad(r, x, RV(0.240226506959101382690753994082));
- r = mad(r, x, RV(0.69314718055994530864272481773));
- r = mad(r, x, RV(0.9999999999999999978508676375));
+ // double, error=9.32016781355638010975628074746e-18
+ r = RV(4.45623165388261696886670014471e-10);
+ r = mad(r, x, RV(7.0733589360775271430968224806e-9));
+ r = mad(r, x, RV(1.01780540270960163558119510246e-7));
+ r = mad(r, x, RV(1.3215437348041505269462510712e-6));
+ r = mad(r, x, RV(0.000015252733849766201174247690629));
+ r = mad(r, x, RV(0.000154035304541242555115696403795));
+ r = mad(r, x, RV(0.00133335581463968601407096905671));
+ r = mad(r, x, RV(0.0096181291075949686712855561931));
+ r = mad(r, x, RV(0.055504108664821672870565883052));
+ r = mad(r, x, RV(0.240226506959101382690753994082));
+ r = mad(r, x, RV(0.69314718055994530864272481773));
+ r = mad(r, x, RV(0.9999999999999999978508676375));
#else
- // double, error=3.74939899823302048807873981077e-14
- r = RV(1.02072375599725694063203809188e-7);
- r = mad(r, x, RV(1.32573274434801314145133004073e-6));
- r = mad(r, x, RV(0.0000152526647170731944840736190013));
- r = mad(r, x, RV(0.000154034441925859828261898614555));
- r = mad(r, x, RV(0.00133335582175770747495287552557));
- r = mad(r, x, RV(0.0096181291794939392517233403183));
- r = mad(r, x, RV(0.055504108664525029438908798685));
- r = mad(r, x, RV(0.240226506957026959772247598695));
- r = mad(r, x, RV(0.6931471805599487321347668143));
- r = mad(r, x, RV(1.00000000000000942892870993489));
+ // double, error=3.74939899823302048807873981077e-14
+ r = RV(1.02072375599725694063203809188e-7);
+ r = mad(r, x, RV(1.32573274434801314145133004073e-6));
+ r = mad(r, x, RV(0.0000152526647170731944840736190013));
+ r = mad(r, x, RV(0.000154034441925859828261898614555));
+ r = mad(r, x, RV(0.00133335582175770747495287552557));
+ r = mad(r, x, RV(0.0096181291794939392517233403183));
+ r = mad(r, x, RV(0.055504108664525029438908798685));
+ r = mad(r, x, RV(0.240226506957026959772247598695));
+ r = mad(r, x, RV(0.6931471805599487321347668143));
+ r = mad(r, x, RV(1.00000000000000942892870993489));
#endif
- break;
- default:
- __builtin_unreachable();
- }
-
- // Undo rescaling
+ break;
+ default:
+ __builtin_unreachable();
+ }
+
+// Undo rescaling
#if 0
// Straightforward implementation
r = ldexp(r, convert_int(round_x));
#elif 1
- // Use direct integer manipulation
- // Extract integer as lowest mantissa bits (highest bits still
- // contain offset, exponent, and sign)
- intvec_t itmp = as_int(tmp);
- // Construct scale factor by setting exponent (this shifts out the
- // highest bits)
- realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
- r *= scale;
+ // Use direct integer manipulation
+ // Extract integer as lowest mantissa bits (highest bits still
+ // contain offset, exponent, and sign)
+ intvec_t itmp = as_int(tmp);
+ // Construct scale factor by setting exponent (this shifts out the
+ // highest bits)
+ realvec_t scale = as_float(itmp << I(FP::mantissa_bits));
+ r *= scale;
#else
- // Use floating point operations instead of integer operations,
- // since these are faster for QPX
- real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
- real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
- realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
- realvec_t scale = as_float(convert_int(exponent));
- r *= scale;
+ // Use floating point operations instead of integer operations,
+ // since these are faster for QPX
+ real_t exponent_factor = R(I(1) << I(FP::mantissa_bits));
+ real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits));
+ realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset));
+ realvec_t scale = as_float(convert_int(exponent));
+ r *= scale;
#endif
-
- r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
-
- return r;
- }
-
-
-
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x)
- {
- return exp2(RV(M_LOG2E) * x);
- }
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x)
- {
- return exp2(RV(M_LOG2E * M_LN10) * x);
- }
+ r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r);
+
+ return r;
+}
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x)
- {
- // TODO: improve this
- return exp(x) - RV(1.0);
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x) {
+ return exp2(RV(M_LOG2E) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x) {
+ return exp2(RV(M_LOG2E * M_LN10) * x);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x) {
+ // TODO: improve this
+ return exp(x) - RV(1.0);
#if 0
r = exp(x) - RV(1.0);
return ifthen(r == RV(0.0), x, r);
#endif
- }
-
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_EXP_H
+#endif // #ifndef MATHFUNCS_EXP_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
index 4f31dec..c3f7356 100644
--- a/mathfuncs_fabs.h
+++ b/mathfuncs_fabs.h
@@ -7,201 +7,176 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) {
+ intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
+ intvec_t sign = as_int(y) & IV(FP::signbit_mask);
+ return as_float(sign | value);
+}
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
- {
- intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
- intvec_t sign = as_int(y) & IV(FP::signbit_mask);
- return as_float(sign | value);
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
- {
- return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y)
- {
- // return ifthen(x > y, x - y, RV(0.0));
- return fmax(x - y, RV(0.0));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z)
- {
- return x * y + z;
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y)
- {
- return ifthen(x < y, y, x);
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y)
- {
- return ifthen(y < x, y, x);
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
- typename realvec_t::intvec_t* irp)
- {
- intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
- intvec_t ir = e - IV(FP::exponent_offset - 1);
- ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) {
+ return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) {
+ // return ifthen(x > y, x - y, RV(0.0));
+ return fmax(x - y, RV(0.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) {
+ return x * y + z;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) {
+ return ifthen(x < y, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) {
+ return ifthen(y < x, y, x);
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x,
+ typename realvec_t::intvec_t *irp) {
+ intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+ intvec_t ir = e - IV(FP::exponent_offset - 1);
+ ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min()));
#if defined VML_HAVE_INF
- ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
+ ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir);
#endif
#if defined VML_HAVE_NAN
- ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
+ ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir);
#endif
- realvec_t r =
+ realvec_t r =
as_float((as_int(x) & IV(FP::signbit_mask | FP::mantissa_mask)) |
IV(FP::as_int(R(0.5)) & FP::exponent_mask));
- boolvec_t iszero = x == RV(0.0);
- ir = ifthen(iszero, IV(I(0)), ir);
- r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
- *irp = ir;
- return r;
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x)
- {
- // TODO: Check SLEEF 2.80 algorithm
- intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
- intvec_t r = e - IV(FP::exponent_offset);
- r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
+ boolvec_t iszero = x == RV(0.0);
+ ir = ifthen(iszero, IV(I(0)), ir);
+ r = ifthen(iszero, copysign(RV(R(0.0)), r), r);
+ *irp = ir;
+ return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x) {
+ // TODO: Check SLEEF 2.80 algorithm
+ intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+ intvec_t r = e - IV(FP::exponent_offset);
+ r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min()));
#if defined VML_HAVE_INF
- r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
+ r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r);
#endif
#if defined VML_HAVE_NAN
- r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
+ r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r);
#endif
- return r;
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t
- mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x)
- {
- return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t
- mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x)
- {
- return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t
- mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x)
- {
- return
- (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
- (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t
- mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x)
- {
- return
- (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
- (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t
- mathfuncs<realvec_t>::vml_isfinite(realvec_t x)
- {
+ return r;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x) {
+ return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x) {
+ return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x) {
+ return (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) &&
+ (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t
+mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x) {
+ return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) &&
+ (as_int(x) & IV(FP::exponent_mask)) != IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isfinite(realvec_t x) {
#if defined VML_HAVE_INF || defined VML_HAVE_NAN
- return vml_ieee_isfinite(x);
+ return vml_ieee_isfinite(x);
#else
- return BV(true);
+ return BV(true);
#endif
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x)
- {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x) {
#if defined VML_HAVE_INF
- return vml_ieee_isinf(x);
+ return vml_ieee_isinf(x);
#else
- return BV(false);
+ return BV(false);
#endif
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x)
- {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x) {
#if defined VML_HAVE_NAN
- return vml_ieee_isnan(x);
+ return vml_ieee_isnan(x);
#else
- return BV(false);
+ return BV(false);
#endif
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x)
- {
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x) {
#if defined VML_HAVE_DENORMALS || defined VML_HAVE_INF || defined VML_HAVE_NAN
- return vml_ieee_isnormal(x);
+ return vml_ieee_isnormal(x);
#else
- return BV(true);
+ return BV(true);
#endif
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n)
- {
- // TODO: Check SLEEF 2.80 algorithm
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n) {
+// TODO: Check SLEEF 2.80 algorithm
#if 0
realvec_t r = as_float(as_int(x) + (n << I(FP::mantissa_bits)));
r = ifthen((as_int(x) & IV(FP::exponent_mask)) == IV(I(0)), x, r);
return r;
#endif
- realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
- int max_n = FP::max_exponent - FP::min_exponent;
- boolvec_t underflow = n < IV(I(-max_n));
- boolvec_t overflow = n > IV(I(max_n));
- intvec_t old_exp =
- lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
- intvec_t new_exp = old_exp + n;
- // TODO: check bit patterns instead
- underflow =
+ realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits)));
+ int max_n = FP::max_exponent - FP::min_exponent;
+ boolvec_t underflow = n < IV(I(-max_n));
+ boolvec_t overflow = n > IV(I(max_n));
+ intvec_t old_exp = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits);
+ intvec_t new_exp = old_exp + n;
+ // TODO: check bit patterns instead
+ underflow =
underflow || new_exp < IV(I(FP::min_exponent + FP::exponent_offset));
- overflow =
+ overflow =
overflow || new_exp > IV(I(FP::max_exponent + FP::exponent_offset));
- r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
- r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
- boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
- r = ifthen(dont_change, x, r);
- return r;
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z)
- {
- return x * y + z;
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x)
- {
- return convert_bool(as_int(x) & IV(FP::signbit_mask));
- }
-
+ r = ifthen(underflow, copysign(RV(R(0.0)), x), r);
+ r = ifthen(overflow, copysign(RV(FP::infinity()), x), r);
+ boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x);
+ r = ifthen(dont_change, x, r);
+ return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z) {
+ return x * y + z;
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x) {
+ return convert_bool(as_int(x) & IV(FP::signbit_mask));
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_FABS_H
+#endif // #ifndef MATHFUNCS_FABS_H
diff --git a/mathfuncs_int.h b/mathfuncs_int.h
index 862189d..fff65ff 100644
--- a/mathfuncs_int.h
+++ b/mathfuncs_int.h
@@ -7,129 +7,128 @@
#include <climits>
+namespace vecmathlib {
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x) {
+ return ifthen(isignbit(x), -x, x);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t
+mathfuncs<realvec_t>::vml_bitifthen(intvec_t x, intvec_t y, intvec_t z) {
+ return (x & y) | (~x & z);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x) {
+ // These implementations return 8*sizeof(TYPE) when the input is 0
+
+ // These explicit implementations are taken from
+ // <http://aggregate.org/MAGIC/>:
+ //
+ // @techreport{magicalgorithms,
+ // author={Henry Gordon Dietz},
+ // title={{The Aggregate Magic Algorithms}},
+ // institution={University of Kentucky},
+ // howpublished={Aggregate.Org online technical report},
+ // date={2013-03-25},
+ // URL={http://aggregate.org/MAGIC/}
+ // }
+
+ int_t bits = CHAR_BIT * sizeof(int_t);
+ if (bits > 1)
+ x |= lsr(x, 1);
+ if (bits > 2)
+ x |= lsr(x, 2);
+ if (bits > 4)
+ x |= lsr(x, 4);
+ if (bits > 8)
+ x |= lsr(x, 8);
+ if (bits > 16)
+ x |= lsr(x, 16);
+ if (bits > 32)
+ x |= lsr(x, 32);
+ if (bits > 64)
+ x |= lsr(x, 64);
+ assert(bits <= 128);
+ return IV(I(bits)) - popcount(x);
+}
+
+template <typename realvec_t>
+typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x) {
+ return x < IV(I(0));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
+ intvec_t y) {
+ return ifthen(x >= y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
+ intvec_t y) {
+ return ifthen(x < y, x, y);
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x) {
+ // These explicit implementations are taken from
+ // <http://aggregate.org/MAGIC/>:
+ //
+ // @techreport{magicalgorithms,
+ // author={Henry Gordon Dietz},
+ // title={{The Aggregate Magic Algorithms}},
+ // institution={University of Kentucky},
+ // howpublished={Aggregate.Org online technical report},
+ // date={2013-03-25},
+ // URL={http://aggregate.org/MAGIC/}
+ // }
+
+ int_t bits = CHAR_BIT * sizeof(int_t);
+
+ // intvec_t x55 = IV(FP::replicate_byte(0x55));
+ // intvec_t x33 = IV(FP::replicate_byte(0x33));
+ // intvec_t x0f = IV(FP::replicate_byte(0x0f));
+ intvec_t x55 = I(~U(0) / U(3)); // 0x0101...
+ intvec_t x33 = I(~U(0) / U(5)); // 0x00110011...
+ intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
+
+ x -= lsr(x, I(1)) & x55;
+ x = (x & x33) + (lsr(x, I(2)) & x33);
+ x += lsr(x, I(4));
+ x &= x0f;
+ if (bits > 8)
+ x += lsr(x, I(8));
+ if (bits > 16)
+ x += lsr(x, I(16));
+ if (bits > 32)
+ x += lsr(x, I(32));
+ if (bits > 64)
+ x += lsr(x, I(64));
+ assert(bits <= 128);
+ return x & IV(I(0xff));
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+ int_t n) {
+ int_t mask = CHAR_BIT * sizeof(int_t) - 1;
+ intvec_t left = x << (n & mask);
+ intvec_t right = lsr(x, -n & mask);
+ return left | right;
+}
+
+template <typename realvec_t>
+typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
+ intvec_t n) {
+ intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
+ intvec_t left = x << (n & mask);
+ intvec_t right = lsr(x, -n & mask);
+ return left | right;
+}
-namespace vecmathlib {
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x)
- {
- return ifthen(isignbit(x), -x, x);
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_bitifthen(intvec_t x,
- intvec_t y,
- intvec_t z)
- {
- return (x & y) | (~x & z);
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x)
- {
- // These implementations return 8*sizeof(TYPE) when the input is 0
-
- // These explicit implementations are taken from
- // <http://aggregate.org/MAGIC/>:
- //
- // @techreport{magicalgorithms,
- // author={Henry Gordon Dietz},
- // title={{The Aggregate Magic Algorithms}},
- // institution={University of Kentucky},
- // howpublished={Aggregate.Org online technical report},
- // date={2013-03-25},
- // URL={http://aggregate.org/MAGIC/}
- // }
-
- int_t bits = CHAR_BIT * sizeof(int_t);
- if (bits > 1) x |= lsr(x, 1);
- if (bits > 2) x |= lsr(x, 2);
- if (bits > 4) x |= lsr(x, 4);
- if (bits > 8) x |= lsr(x, 8);
- if (bits > 16) x |= lsr(x, 16);
- if (bits > 32) x |= lsr(x, 32);
- if (bits > 64) x |= lsr(x, 64);
- assert(bits<=128);
- return IV(I(bits)) - popcount(x);
- }
-
- template<typename realvec_t>
- typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x)
- {
- return x < IV(I(0));
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x,
- intvec_t y)
- {
- return ifthen(x>=y, x, y);
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x,
- intvec_t y)
- {
- return ifthen(x<y, x, y);
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x)
- {
- // These explicit implementations are taken from
- // <http://aggregate.org/MAGIC/>:
- //
- // @techreport{magicalgorithms,
- // author={Henry Gordon Dietz},
- // title={{The Aggregate Magic Algorithms}},
- // institution={University of Kentucky},
- // howpublished={Aggregate.Org online technical report},
- // date={2013-03-25},
- // URL={http://aggregate.org/MAGIC/}
- // }
-
- int_t bits = CHAR_BIT * sizeof(int_t);
-
- // intvec_t x55 = IV(FP::replicate_byte(0x55));
- // intvec_t x33 = IV(FP::replicate_byte(0x33));
- // intvec_t x0f = IV(FP::replicate_byte(0x0f));
- intvec_t x55 = I(~U(0) / U(3)); // 0x0101...
- intvec_t x33 = I(~U(0) / U(5)); // 0x00110011...
- intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111...
-
- x -= lsr(x, I(1)) & x55;
- x = (x & x33) + (lsr(x, I(2)) & x33);
- x += lsr(x, I(4));
- x &= x0f;
- if (bits > 8) x += lsr(x, I(8));
- if (bits > 16) x += lsr(x, I(16));
- if (bits > 32) x += lsr(x, I(32));
- if (bits > 64) x += lsr(x, I(64));
- assert(bits<=128);
- return x & IV(I(0xff));
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
- int_t n)
- {
- int_t mask = CHAR_BIT * sizeof(int_t) - 1;
- intvec_t left = x << (n & mask);
- intvec_t right = lsr(x, -n & mask);
- return left | right;
- }
-
- template<typename realvec_t>
- typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x,
- intvec_t n)
- {
- intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1));
- intvec_t left = x << (n & mask);
- intvec_t right = lsr(x, -n & mask);
- return left | right;
- }
-
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_ASIN_H
+#endif // #ifndef MATHFUNCS_ASIN_H
diff --git a/mathfuncs_log.h b/mathfuncs_log.h
index cd71eb3..fa517ba 100644
--- a/mathfuncs_log.h
+++ b/mathfuncs_log.h
@@ -7,93 +7,82 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x) {
+ // Algorithm inspired by SLEEF 2.80
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x)
- {
- // Algorithm inspired by SLEEF 2.80
-
- // Rescale
- intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
- x = ldexp(x, -ilogb_x);
- VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
-
- realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
- realvec_t y2 = y*y;
-
- realvec_t r;
- switch (sizeof(real_t)) {
- case 4:
- // float, error=7.09807175879142775648452461821e-8
- r = RV(0.59723611417135718739797302426);
- r = mad(r, y2, RV(0.961524413175528426101613434));
- r = mad(r, y2, RV(2.88539097665498228703236701));
- break;
- case 8:
+ // Rescale
+ intvec_t ilogb_x = ilogb(x * RV(M_SQRT2));
+ x = ldexp(x, -ilogb_x);
+ VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2)));
+
+ realvec_t y = (x - RV(1.0)) / (x + RV(1.0));
+ realvec_t y2 = y * y;
+
+ realvec_t r;
+ switch (sizeof(real_t)) {
+ case 4:
+ // float, error=7.09807175879142775648452461821e-8
+ r = RV(0.59723611417135718739797302426);
+ r = mad(r, y2, RV(0.961524413175528426101613434));
+ r = mad(r, y2, RV(2.88539097665498228703236701));
+ break;
+ case 8:
#ifdef VML_HAVE_FP_CONTRACT
- // double, error=1.48294180185938512675770096324e-16
- r = RV(0.243683403415639178527756320773);
- r = mad(r, y2, RV(0.26136626803870009948502658));
- r = mad(r, y2, RV(0.320619429891299265439389));
- r = mad(r, y2, RV(0.4121983452028499242926));
- r = mad(r, y2, RV(0.577078017761894161436));
- r = mad(r, y2, RV(0.96179669392233355927));
- r = mad(r, y2, RV(2.8853900817779295236));
+ // double, error=1.48294180185938512675770096324e-16
+ r = RV(0.243683403415639178527756320773);
+ r = mad(r, y2, RV(0.26136626803870009948502658));
+ r = mad(r, y2, RV(0.320619429891299265439389));
+ r = mad(r, y2, RV(0.4121983452028499242926));
+ r = mad(r, y2, RV(0.577078017761894161436));
+ r = mad(r, y2, RV(0.96179669392233355927));
+ r = mad(r, y2, RV(2.8853900817779295236));
#else
- // double, error=2.1410114030383689267772704676e-14
- r = RV(0.283751646449323373643963474845);
- r = mad(r, y2, RV(0.31983138095551191299118812));
- r = mad(r, y2, RV(0.412211603844146279666022));
- r = mad(r, y2, RV(0.5770779098948940070516));
- r = mad(r, y2, RV(0.961796694295973716912));
- r = mad(r, y2, RV(2.885390081777562819196));
+ // double, error=2.1410114030383689267772704676e-14
+ r = RV(0.283751646449323373643963474845);
+ r = mad(r, y2, RV(0.31983138095551191299118812));
+ r = mad(r, y2, RV(0.412211603844146279666022));
+ r = mad(r, y2, RV(0.5770779098948940070516));
+ r = mad(r, y2, RV(0.961796694295973716912));
+ r = mad(r, y2, RV(2.885390081777562819196));
#endif
- break;
- default:
- __builtin_unreachable();
- }
- r *= y;
-
- // Undo rescaling
- r += convert_float(ilogb_x);
-
- return r;
- }
-
-
-
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x)
- {
- return log2(x) * RV(M_LN2);
+ break;
+ default:
+ __builtin_unreachable();
}
+ r *= y;
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x)
- {
- return log(x) * RV(M_LOG10E);
- }
+ // Undo rescaling
+ r += convert_float(ilogb_x);
- template<typename realvec_t>
- inline
- realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x)
- {
- // TODO: Check SLEEF 2.80 algorithm
-
- return log(RV(1.0) + x);
+ return r;
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x) {
+ return log2(x) * RV(M_LN2);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x) {
+ return log(x) * RV(M_LOG10E);
+}
+
+template <typename realvec_t>
+inline realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x) {
+ // TODO: Check SLEEF 2.80 algorithm
+
+ return log(RV(1.0) + x);
#if 0
// Goldberg, theorem 4
realvec_t x1 = RV(1.0) + x;
x1.barrier();
return ifthen(x1 == x, x, x * log(x1) / (x1 - RV(1.0)));
#endif
- }
-
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_LOG_H
+#endif // #ifndef MATHFUNCS_LOG_H
diff --git a/mathfuncs_pow.h b/mathfuncs_pow.h
index b863570..70bcc80 100644
--- a/mathfuncs_pow.h
+++ b/mathfuncs_pow.h
@@ -7,30 +7,27 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y) {
+ // Handle zero
+ boolvec_t is_zero = x == RV(0.0);
+ x = ifthen(is_zero, RV(1.0), x);
+
+ realvec_t r = exp(log(fabs(x)) * y);
+
+ // The result is negative if x<0 and if y is integer and odd
+ realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
+ realvec_t sign = copysign(mod_y, x) + RV(0.5);
+ r = copysign(r, sign);
+
+ // Handle zero
+ r = ifthen(is_zero, RV(0.0), r);
+
+ return r;
+}
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y)
- {
- // Handle zero
- boolvec_t is_zero = x == RV(0.0);
- x = ifthen(is_zero, RV(1.0), x);
-
- realvec_t r = exp(log(fabs(x)) * y);
-
- // The result is negative if x<0 and if y is integer and odd
- realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y));
- realvec_t sign = copysign(mod_y, x) + RV(0.5);
- r = copysign(r, sign);
-
- // Handle zero
- r = ifthen(is_zero, RV(0.0), r);
-
- return r;
- }
-
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_POW_H
+#endif // #ifndef MATHFUNCS_POW_H
diff --git a/mathfuncs_rcp.h b/mathfuncs_rcp.h
index 6e12b27..f703454 100644
--- a/mathfuncs_rcp.h
+++ b/mathfuncs_rcp.h
@@ -7,10 +7,8 @@
#include <cmath>
-
-
namespace vecmathlib {
-
+
#if 0
// This routine works, but may be slower than the one below
template<typename realvec_t>
@@ -50,66 +48,61 @@ namespace vecmathlib {
return r;
}
#endif
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x)
- {
- // Handle negative values
- realvec_t x0 = x;
- x = fabs(x);
-
- // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
-
- // Initial guess
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x) {
+ // Handle negative values
+ realvec_t x0 = x;
+ x = fabs(x);
+
+ // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28]
+
+ // Initial guess
+ VML_ASSERT(all(x > RV(0.0)));
+ intvec_t x_exp;
+ x = frexp(x, &x_exp);
+ VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
+ realvec_t r = RV(R(48.0) / R(17.0)) - RV(R(32.0) / R(17.0)) * x;
+
+ // Iterate
+ int const nmax = sizeof(real_t) == 4 ? 3 : 4;
+ for (int n = 0; n < nmax; ++n) {
+ // Step
VML_ASSERT(all(x > RV(0.0)));
- intvec_t x_exp;
- x = frexp(x, &x_exp);
- VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0)));
- realvec_t r = RV(R(48.0)/R(17.0)) - RV(R(32.0)/R(17.0)) * x;
-
- // Iterate
- int const nmax = sizeof(real_t)==4 ? 3 : 4;
- for (int n=0; n<nmax; ++n) {
- // Step
- VML_ASSERT(all(x > RV(0.0)));
- // Newton method:
- // Solve f(r) = 0 for f(r) = x - 1/r
- // r <- r - f(r) / f'(r)
- // r <- 2 r - r^2 x
- // r <- r + r (1 - r x)
-
- // Note: don't rewrite this expression, this may introduce
- // cancellation errors
- r += r * (RV(1.0) - x*r);
-
- // NEON: r = r * (RV(2.0) - x*r);
- }
- r = ldexp(r, -x_exp);
-
- // Handle negative values
- r = copysign(r, x0);
-
- return r;
- }
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y)
- {
- return x - rint(x / y) * y;
- // realvec_t r = x / y;
- // return y * (r - rint(r));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y)
- {
- return x - y * trunc(x / y);
- // realvec_t r = x / y;
- // return y * (r - trunc(r));
+ // Newton method:
+ // Solve f(r) = 0 for f(r) = x - 1/r
+ // r <- r - f(r) / f'(r)
+ // r <- 2 r - r^2 x
+ // r <- r + r (1 - r x)
+
+ // Note: don't rewrite this expression, this may introduce
+ // cancellation errors
+ r += r * (RV(1.0) - x * r);
+
+ // NEON: r = r * (RV(2.0) - x*r);
}
-
+ r = ldexp(r, -x_exp);
+
+ // Handle negative values
+ r = copysign(r, x0);
+
+ return r;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y) {
+ return x - rint(x / y) * y;
+ // realvec_t r = x / y;
+ // return y * (r - rint(r));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y) {
+ return x - y * trunc(x / y);
+ // realvec_t r = x / y;
+ // return y * (r - trunc(r));
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_RCP_H
+#endif // #ifndef MATHFUNCS_RCP_H
diff --git a/mathfuncs_sin.h b/mathfuncs_sin.h
index 8e2afd9..72ffb6f 100644
--- a/mathfuncs_sin.h
+++ b/mathfuncs_sin.h
@@ -7,230 +7,227 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d) {
+ // Algorithm taken from SLEEF 2.80
+
+ real_t PI4_A, PI4_B, PI4_C, PI4_D;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ PI4_A = 0.78515625f;
+ PI4_B = 0.00024187564849853515625f;
+ PI4_C = 3.7747668102383613586e-08f;
+ PI4_D = 1.2816720341285448015e-12f;
+ break;
+ case sizeof(double):
+ PI4_A = 0.78539816290140151978;
+ PI4_B = 4.9604678871439933374e-10;
+ PI4_C = 1.1258708853173288931e-18;
+ PI4_D = 1.7607799325916000908e-27;
+ break;
+ }
+
+ realvec_t q = rint(d * RV(M_1_PI));
+ intvec_t iq = convert_int(q);
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d)
- {
- // Algorithm taken from SLEEF 2.80
-
- real_t PI4_A, PI4_B, PI4_C, PI4_D;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- PI4_A = 0.78515625f;
- PI4_B = 0.00024187564849853515625f;
- PI4_C = 3.7747668102383613586e-08f;
- PI4_D = 1.2816720341285448015e-12f;
- break;
- case sizeof(double):
- PI4_A = 0.78539816290140151978;
- PI4_B = 4.9604678871439933374e-10;
- PI4_C = 1.1258708853173288931e-18;
- PI4_D = 1.7607799325916000908e-27;
- break;
- }
-
- realvec_t q = rint(d * RV(M_1_PI));
- intvec_t iq = convert_int(q);
-
#ifdef VML_HAVE_FP_CONTRACT
- d = mad(q, RV(-PI4_A*4), d);
- d = mad(q, RV(-PI4_B*4), d);
- d = mad(q, RV(-PI4_C*4), d);
- d = mad(q, RV(-PI4_D*4), d);
+ d = mad(q, RV(-PI4_A * 4), d);
+ d = mad(q, RV(-PI4_B * 4), d);
+ d = mad(q, RV(-PI4_C * 4), d);
+ d = mad(q, RV(-PI4_D * 4), d);
#else
- d = mad(q, RV(-M_PI), d);
+ d = mad(q, RV(-M_PI), d);
#endif
-
- realvec_t s = d * d;
-
- d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
-
- realvec_t u;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- u = RV(2.6083159809786593541503e-06f);
- u = mad(u, s, RV(-0.0001981069071916863322258f));
- u = mad(u, s, RV(0.00833307858556509017944336f));
- u = mad(u, s, RV(-0.166666597127914428710938f));
- break;
- case sizeof(double):
- u = RV(-7.97255955009037868891952e-18);
- u = mad(u, s, RV(2.81009972710863200091251e-15));
- u = mad(u, s, RV(-7.64712219118158833288484e-13));
- u = mad(u, s, RV(1.60590430605664501629054e-10));
- u = mad(u, s, RV(-2.50521083763502045810755e-08));
- u = mad(u, s, RV(2.75573192239198747630416e-06));
- u = mad(u, s, RV(-0.000198412698412696162806809));
- u = mad(u, s, RV(0.00833333333333332974823815));
- u = mad(u, s, RV(-0.166666666666666657414808));
- break;
- }
-
- u = mad(s, u * d, d);
-
- const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
- u = ifthen(isinf(d), RV(nan), u);
-
- return u;
+
+ realvec_t s = d * d;
+
+ d = ifthen(convert_bool(iq & IV(I(1))), -d, d);
+
+ realvec_t u;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ u = RV(2.6083159809786593541503e-06f);
+ u = mad(u, s, RV(-0.0001981069071916863322258f));
+ u = mad(u, s, RV(0.00833307858556509017944336f));
+ u = mad(u, s, RV(-0.166666597127914428710938f));
+ break;
+ case sizeof(double):
+ u = RV(-7.97255955009037868891952e-18);
+ u = mad(u, s, RV(2.81009972710863200091251e-15));
+ u = mad(u, s, RV(-7.64712219118158833288484e-13));
+ u = mad(u, s, RV(1.60590430605664501629054e-10));
+ u = mad(u, s, RV(-2.50521083763502045810755e-08));
+ u = mad(u, s, RV(2.75573192239198747630416e-06));
+ u = mad(u, s, RV(-0.000198412698412696162806809));
+ u = mad(u, s, RV(0.00833333333333332974823815));
+ u = mad(u, s, RV(-0.166666666666666657414808));
+ break;
+ }
+
+ u = mad(s, u * d, d);
+
+ const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+ u = ifthen(isinf(d), RV(nan), u);
+
+ return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d) {
+ // Algorithm taken from SLEEF 2.80
+
+ real_t PI4_A, PI4_B, PI4_C, PI4_D;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ PI4_A = 0.78515625f;
+ PI4_B = 0.00024187564849853515625f;
+ PI4_C = 3.7747668102383613586e-08f;
+ PI4_D = 1.2816720341285448015e-12f;
+ break;
+ case sizeof(double):
+ PI4_A = 0.78539816290140151978;
+ PI4_B = 4.9604678871439933374e-10;
+ PI4_C = 1.1258708853173288931e-18;
+ PI4_D = 1.7607799325916000908e-27;
+ break;
}
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d)
- {
- // Algorithm taken from SLEEF 2.80
-
- real_t PI4_A, PI4_B, PI4_C, PI4_D;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- PI4_A = 0.78515625f;
- PI4_B = 0.00024187564849853515625f;
- PI4_C = 3.7747668102383613586e-08f;
- PI4_D = 1.2816720341285448015e-12f;
- break;
- case sizeof(double):
- PI4_A = 0.78539816290140151978;
- PI4_B = 4.9604678871439933374e-10;
- PI4_C = 1.1258708853173288931e-18;
- PI4_D = 1.7607799325916000908e-27;
- break;
- }
-
- realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
- intvec_t iq = convert_int(q);
-
+
+ realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0));
+ intvec_t iq = convert_int(q);
+
#ifdef VML_HAVE_FP_CONTRACT
- d = mad(q, RV(-PI4_A*2), d);
- d = mad(q, RV(-PI4_B*2), d);
- d = mad(q, RV(-PI4_C*2), d);
- d = mad(q, RV(-PI4_D*2), d);
+ d = mad(q, RV(-PI4_A * 2), d);
+ d = mad(q, RV(-PI4_B * 2), d);
+ d = mad(q, RV(-PI4_C * 2), d);
+ d = mad(q, RV(-PI4_D * 2), d);
#else
- d = mad(q, RV(-M_PI_2), d);
+ d = mad(q, RV(-M_PI_2), d);
#endif
-
- realvec_t s = d * d;
-
- d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
-
- realvec_t u;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- u = RV(2.6083159809786593541503e-06f);
- u = mad(u, s, RV(-0.0001981069071916863322258f));
- u = mad(u, s, RV(0.00833307858556509017944336f));
- u = mad(u, s, RV(-0.166666597127914428710938f));
- break;
- case sizeof(double):
- u = RV(-7.97255955009037868891952e-18);
- u = mad(u, s, RV(2.81009972710863200091251e-15));
- u = mad(u, s, RV(-7.64712219118158833288484e-13));
- u = mad(u, s, RV(1.60590430605664501629054e-10));
- u = mad(u, s, RV(-2.50521083763502045810755e-08));
- u = mad(u, s, RV(2.75573192239198747630416e-06));
- u = mad(u, s, RV(-0.000198412698412696162806809));
- u = mad(u, s, RV(0.00833333333333332974823815));
- u = mad(u, s, RV(-0.166666666666666657414808));
- break;
- }
-
- u = mad(s, u * d, d);
-
- const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
- u = ifthen(isinf(d), RV(nan), u);
-
- return u;
+
+ realvec_t s = d * d;
+
+ d = ifthen(convert_bool(iq & IV(I(2))), d, -d);
+
+ realvec_t u;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ u = RV(2.6083159809786593541503e-06f);
+ u = mad(u, s, RV(-0.0001981069071916863322258f));
+ u = mad(u, s, RV(0.00833307858556509017944336f));
+ u = mad(u, s, RV(-0.166666597127914428710938f));
+ break;
+ case sizeof(double):
+ u = RV(-7.97255955009037868891952e-18);
+ u = mad(u, s, RV(2.81009972710863200091251e-15));
+ u = mad(u, s, RV(-7.64712219118158833288484e-13));
+ u = mad(u, s, RV(1.60590430605664501629054e-10));
+ u = mad(u, s, RV(-2.50521083763502045810755e-08));
+ u = mad(u, s, RV(2.75573192239198747630416e-06));
+ u = mad(u, s, RV(-0.000198412698412696162806809));
+ u = mad(u, s, RV(0.00833333333333332974823815));
+ u = mad(u, s, RV(-0.166666666666666657414808));
+ break;
}
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d)
- {
- // Algorithm taken from SLEEF 2.80
-
- real_t PI4_A, PI4_B, PI4_C, PI4_D;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- PI4_A = 0.78515625f;
- PI4_B = 0.00024187564849853515625f;
- PI4_C = 3.7747668102383613586e-08f;
- PI4_D = 1.2816720341285448015e-12f;
- break;
- case sizeof(double):
- PI4_A = 0.78539816290140151978;
- PI4_B = 4.9604678871439933374e-10;
- PI4_C = 1.1258708853173288931e-18;
- PI4_D = 1.7607799325916000908e-27;
- break;
- }
-
- realvec_t q = rint(d * RV(2 * M_1_PI));
- intvec_t iq = convert_int(q);
-
- realvec_t x = d;
-
+
+ u = mad(s, u * d, d);
+
+ const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+ u = ifthen(isinf(d), RV(nan), u);
+
+ return u;
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d) {
+ // Algorithm taken from SLEEF 2.80
+
+ real_t PI4_A, PI4_B, PI4_C, PI4_D;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ PI4_A = 0.78515625f;
+ PI4_B = 0.00024187564849853515625f;
+ PI4_C = 3.7747668102383613586e-08f;
+ PI4_D = 1.2816720341285448015e-12f;
+ break;
+ case sizeof(double):
+ PI4_A = 0.78539816290140151978;
+ PI4_B = 4.9604678871439933374e-10;
+ PI4_C = 1.1258708853173288931e-18;
+ PI4_D = 1.7607799325916000908e-27;
+ break;
+ }
+
+ realvec_t q = rint(d * RV(2 * M_1_PI));
+ intvec_t iq = convert_int(q);
+
+ realvec_t x = d;
+
#ifdef VML_HAVE_FP_CONTRACT
- x = mad(q, RV(-PI4_A*2), x);
- x = mad(q, RV(-PI4_B*2), x);
- x = mad(q, RV(-PI4_C*2), x);
- x = mad(q, RV(-PI4_D*2), x);
+ x = mad(q, RV(-PI4_A * 2), x);
+ x = mad(q, RV(-PI4_B * 2), x);
+ x = mad(q, RV(-PI4_C * 2), x);
+ x = mad(q, RV(-PI4_D * 2), x);
#else
- x = mad(q, RV(-M_PI_2), x);
+ x = mad(q, RV(-M_PI_2), x);
#endif
-
- realvec_t s = x * x;
-
- x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
-
- realvec_t u;
- switch (sizeof(real_t)) {
- default: __builtin_unreachable();
- case sizeof(float):
- u = RV(0.00927245803177356719970703f);
- u = mad(u, s, RV(0.00331984995864331722259521f));
- u = mad(u, s, RV(0.0242998078465461730957031f));
- u = mad(u, s, RV(0.0534495301544666290283203f));
- u = mad(u, s, RV(0.133383005857467651367188f));
- u = mad(u, s, RV(0.333331853151321411132812f));
- break;
- case sizeof(double):
- u = RV(1.01419718511083373224408e-05);
- u = mad(u, s, RV(-2.59519791585924697698614e-05));
- u = mad(u, s, RV(5.23388081915899855325186e-05));
- u = mad(u, s, RV(-3.05033014433946488225616e-05));
- u = mad(u, s, RV(7.14707504084242744267497e-05));
- u = mad(u, s, RV(8.09674518280159187045078e-05));
- u = mad(u, s, RV(0.000244884931879331847054404));
- u = mad(u, s, RV(0.000588505168743587154904506));
- u = mad(u, s, RV(0.00145612788922812427978848));
- u = mad(u, s, RV(0.00359208743836906619142924));
- u = mad(u, s, RV(0.00886323944362401618113356));
- u = mad(u, s, RV(0.0218694882853846389592078));
- u = mad(u, s, RV(0.0539682539781298417636002));
- u = mad(u, s, RV(0.133333333333125941821962));
- u = mad(u, s, RV(0.333333333333334980164153));
- break;
- }
-
- u = mad(s, u * x, x);
-
- u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
-
- const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
- u = ifthen(isinf(d), RV(nan), u);
-
- return u;
+
+ realvec_t s = x * x;
+
+ x = ifthen(convert_bool(iq & IV(I(1))), -x, x);
+
+ realvec_t u;
+ switch (sizeof(real_t)) {
+ default:
+ __builtin_unreachable();
+ case sizeof(float):
+ u = RV(0.00927245803177356719970703f);
+ u = mad(u, s, RV(0.00331984995864331722259521f));
+ u = mad(u, s, RV(0.0242998078465461730957031f));
+ u = mad(u, s, RV(0.0534495301544666290283203f));
+ u = mad(u, s, RV(0.133383005857467651367188f));
+ u = mad(u, s, RV(0.333331853151321411132812f));
+ break;
+ case sizeof(double):
+ u = RV(1.01419718511083373224408e-05);
+ u = mad(u, s, RV(-2.59519791585924697698614e-05));
+ u = mad(u, s, RV(5.23388081915899855325186e-05));
+ u = mad(u, s, RV(-3.05033014433946488225616e-05));
+ u = mad(u, s, RV(7.14707504084242744267497e-05));
+ u = mad(u, s, RV(8.09674518280159187045078e-05));
+ u = mad(u, s, RV(0.000244884931879331847054404));
+ u = mad(u, s, RV(0.000588505168743587154904506));
+ u = mad(u, s, RV(0.00145612788922812427978848));
+ u = mad(u, s, RV(0.00359208743836906619142924));
+ u = mad(u, s, RV(0.00886323944362401618113356));
+ u = mad(u, s, RV(0.0218694882853846389592078));
+ u = mad(u, s, RV(0.0539682539781298417636002));
+ u = mad(u, s, RV(0.133333333333125941821962));
+ u = mad(u, s, RV(0.333333333333334980164153));
+ break;
}
-
+
+ u = mad(s, u * x, x);
+
+ u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u);
+
+ const real_t nan = std::numeric_limits<real_t>::quiet_NaN();
+ u = ifthen(isinf(d), RV(nan), u);
+
+ return u;
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_SIN_H
+#endif // #ifndef MATHFUNCS_SIN_H
diff --git a/mathfuncs_sinh.h b/mathfuncs_sinh.h
index 04aa446..a8c2ee3 100644
--- a/mathfuncs_sinh.h
+++ b/mathfuncs_sinh.h
@@ -7,28 +7,23 @@
#include <cmath>
+namespace vecmathlib {
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x) {
+ return RV(0.5) * (exp(x) + exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x) {
+ return RV(0.5) * (exp(x) - exp(-x));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x) {
+ return sinh(x) / cosh(x);
+}
-namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x)
- {
- return RV(0.5) * (exp(x) + exp(-x));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x)
- {
- return RV(0.5) * (exp(x) - exp(-x));
- }
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x)
- {
- return sinh(x) / cosh(x);
- }
-
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_SINH_H
+#endif // #ifndef MATHFUNCS_SINH_H
diff --git a/mathfuncs_sqrt.h b/mathfuncs_sqrt.h
index dea5fd6..7a362f9 100644
--- a/mathfuncs_sqrt.h
+++ b/mathfuncs_sqrt.h
@@ -7,13 +7,10 @@
#include <cmath>
-
-
namespace vecmathlib {
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x)
- {
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) {
#if 0
// Handle special case: zero
boolvec_t is_zero = x <= RV(0.0);
@@ -49,29 +46,23 @@ namespace vecmathlib {
// Handle special case: zero
r = ifthen(is_zero, RV(0.0), r);
#endif
-
- realvec_t r = x * rsqrt(x);
- // Handle special case: zero
- r = ifthen(x == RV(0.0), RV(0.0), r);
-
- return r;
- }
-
-
-
- // TODO: Use "Halley's method with cubic convergence":
- // <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x)
- {
- return pow(x, RV(1.0/3.0));
- }
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x)
- {
+
+ realvec_t r = x * rsqrt(x);
+ // Handle special case: zero
+ r = ifthen(x == RV(0.0), RV(0.0), r);
+
+ return r;
+}
+
+// TODO: Use "Halley's method with cubic convergence":
+// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf>
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) {
+ return pow(x, RV(1.0 / 3.0));
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) {
#if 0
// See <http://en.wikipedia.org/wiki/Fast_inverse_square_root>
realvec_t x_2 = RV(0.5) * x;
@@ -85,46 +76,43 @@ namespace vecmathlib {
r += r * (RV(0.5) - (x_2 * r * r));
return r;
#else
- // Initial guess
- // VML_ASSERT(all(x > RV(0.0)));
- intvec_t ilogb_x = ilogb(x);
- realvec_t s =
+ // Initial guess
+ // VML_ASSERT(all(x > RV(0.0)));
+ intvec_t ilogb_x = ilogb(x);
+ realvec_t s =
ifthen(convert_bool(ilogb_x & IV(I(1))), RV(R(0.583)), RV(R(0.824)));
- realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
-
- realvec_t x_2 = RV(0.5) * x;
-
- // Iterate
- // nmax iterations give an accuracy of 2^nmax binary digits. 5
- // iterations suffice for double precision with its 53 digits.
- int const nmax = sizeof(real_t)==4 ? 4 : 5;
- for (int n=0; n<nmax; ++n) {
- // Step
- VML_ASSERT(all(r > RV(0.0)));
- // Newton method:
- // Solve f(r) = 0 for f(r) = x - 1/r^2
- // r <- r - f(r) / f'(r)
- // r <- (3 r - r^3 x) / 2
- // r <- r (3/2 - r^2 x/2)
-
- // Note: don't rewrite this expression, this may introduce
- // cancellation errors (says who?)
- // r *= RV(1.5) - x_2 * r*r;
- r += r * (RV(0.5) - x_2 * r*r);
- }
-
- return r;
-#endif
- }
-
-
-
- template<typename realvec_t>
- realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y)
- {
- return sqrt(x*x + y*y);
+ realvec_t r = ldexp(s, -(ilogb_x >> I(1)));
+
+ realvec_t x_2 = RV(0.5) * x;
+
+ // Iterate
+ // nmax iterations give an accuracy of 2^nmax binary digits. 5
+ // iterations suffice for double precision with its 53 digits.
+ int const nmax = sizeof(real_t) == 4 ? 4 : 5;
+ for (int n = 0; n < nmax; ++n) {
+ // Step
+ VML_ASSERT(all(r > RV(0.0)));
+ // Newton method:
+ // Solve f(r) = 0 for f(r) = x - 1/r^2
+ // r <- r - f(r) / f'(r)
+ // r <- (3 r - r^3 x) / 2
+ // r <- r (3/2 - r^2 x/2)
+
+ // Note: don't rewrite this expression, this may introduce
+ // cancellation errors (says who?)
+ // r *= RV(1.5) - x_2 * r*r;
+ r += r * (RV(0.5) - x_2 * r * r);
}
-
+
+ return r;
+#endif
+}
+
+template <typename realvec_t>
+realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) {
+ return sqrt(x * x + y * y);
+}
+
}; // namespace vecmathlib
-#endif // #ifndef MATHFUNCS_SQRT_H
+#endif // #ifndef MATHFUNCS_SQRT_H
diff --git a/selftest.cc b/selftest.cc
index 4296f14..334d95f 100644
--- a/selftest.cc
+++ b/selftest.cc
@@ -14,22 +14,17 @@
using namespace std;
-
-
int num_errors = 0;
+template <typename realvec_t> struct vecmathlib_test {
-
-template<typename realvec_t>
-struct vecmathlib_test {
-
typedef typename realvec_t::boolvec_t boolvec_t;
typedef typename realvec_t::intvec_t intvec_t;
-
+
typedef typename realvec_t::int_t int_t;
typedef typename realvec_t::uint_t uint_t;
typedef typename realvec_t::real_t real_t;
-
+
// Short names for type casts
typedef real_t R;
typedef int_t I;
@@ -37,16 +32,13 @@ struct vecmathlib_test {
typedef realvec_t RV;
typedef intvec_t IV;
typedef boolvec_t BV;
-
+
typedef vecmathlib::floatprops<real_t> FP;
typedef vecmathlib::mathfuncs<realvec_t> MF;
-
-
-
+
// Test each function with this many random values
static const int imax = 10000;
- static real_t accuracy(real_t ulp = R(0.5))
- {
+ static real_t accuracy(real_t ulp = R(0.5)) {
#ifdef VML_HAVE_FP_CONTRACT
// Require that 100% of the digits are correct
// real_t digit_fraction = 1.0;
@@ -56,526 +48,451 @@ struct vecmathlib_test {
// Require that 80% of the digits are correct
real_t digit_fraction = 0.8;
#endif
- digit_fraction *= 0.95; // some lenience for testing (why?)
+ digit_fraction *= 0.95; // some lenience for testing (why?)
return pow(ulp * realvec_t::epsilon(), digit_fraction);
}
-
-
-
- static realvec_t random(const real_t xmin, const real_t xmax)
- {
+
+ static realvec_t random(const real_t xmin, const real_t xmax) {
realvec_t x;
- for (int i=0; i<realvec_t::size; ++i) {
- const real_t r =
- (xmax - xmin) * FP::convert_float(rand()) / FP::convert_float(RAND_MAX);
+ for (int i = 0; i < realvec_t::size; ++i) {
+ const real_t r = (xmax - xmin) * FP::convert_float(rand()) /
+ FP::convert_float(RAND_MAX);
x.set_elt(i, xmin + r);
}
return x;
}
-
- static intvec_t random(const int_t nmin, const int_t nmax)
- {
+
+ static intvec_t random(const int_t nmin, const int_t nmax) {
intvec_t n;
- for (int i=0; i<intvec_t::size; ++i) {
- const real_t r =
- R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0));
+ for (int i = 0; i < intvec_t::size; ++i) {
+ const real_t r = R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0));
n.set_elt(i, nmin + FP::convert_int(floor(r)));
}
return n;
}
-
-
-
- static bool is_big_endian()
- {
+
+ static bool is_big_endian() {
const int i = 1;
unsigned char cs[sizeof i];
memcpy(cs, &i, sizeof i);
- return cs[0]==0;
+ return cs[0] == 0;
}
-
- template<typename T>
- static string hex(const T x)
- {
+
+ template <typename T> static string hex(const T x) {
unsigned char cs[sizeof x];
memcpy(cs, &x, sizeof x);
ostringstream buf;
buf << "0x";
- const char* const hexdigits = "0123456789abcdef";
+ const char *const hexdigits = "0123456789abcdef";
const int n0 = is_big_endian() ? 0 : sizeof x - 1;
const int dn = is_big_endian() ? +1 : -1;
const int n1 = n0 + sizeof x * dn;
- for (int n=n0; n!=n1; n+=dn) {
- buf << hexdigits[cs[n]>>4] << hexdigits[cs[n]&15];
+ for (int n = n0; n != n1; n += dn) {
+ buf << hexdigits[cs[n] >> 4] << hexdigits[cs[n] & 15];
}
return buf.str();
}
-
-
-
- static boolvec_t supported(realvec_t x)
- {
- return x==RV(0.0) || MF::vml_ieee_isnormal(x)
+
+ static boolvec_t supported(realvec_t x) {
+ return x == RV(0.0) || MF::vml_ieee_isnormal(x)
#ifdef VML_HAVE_DENORMALS
- || MF::vml_ieee_isfinite(x)
+ || MF::vml_ieee_isfinite(x)
#endif
#ifdef VML_HAVE_INF
- || MF::vml_ieee_isinf(x)
+ || MF::vml_ieee_isinf(x)
#endif
#ifdef VML_HAVE_NAN
- || MF::vml_ieee_isnan(x)
+ || MF::vml_ieee_isnan(x)
#endif
- ;
- }
-
- static boolvec_t supported(intvec_t x)
- {
- return true;
- }
-
- static boolvec_t supported(boolvec_t x)
- {
- return true;
+ ;
}
-
-
-
+
+ static boolvec_t supported(intvec_t x) { return true; }
+
+ static boolvec_t supported(boolvec_t x) { return true; }
+
// Check load memory access
- static void check_mem(const char* const func,
- const realvec_t x,
- const real_t* const p,
- const realvec_t xold,
- const int mval)
- {
+ static void check_mem(const char *const func, const realvec_t x,
+ const real_t *const p, const realvec_t xold,
+ const int mval) {
realvec_t xwant;
- for (int i=0; i<realvec_t::size; ++i) {
- xwant.set_elt(i, mval & (1<<i) ? p[i] : xold[i]);
+ for (int i = 0; i < realvec_t::size; ++i) {
+ xwant.set_elt(i, mval & (1 << i) ? p[i] : xold[i]);
}
const boolvec_t isbad = x != xwant;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " found=" << x << " [" << hex(x) << "]\n"
<< " expected=" << xwant << " [" << hex(xwant) << "]\n"
<< " mval=" << mval << " [" << hex(mval) << "]\n"
- << " isbad=" << isbad << "\n"
- << flush;
+ << " isbad=" << isbad << "\n" << flush;
}
}
-
+
// Check store memory access
- static void check_mem(const char* const func,
- const real_t* const p,
- const realvec_t x,
- const real_t* const pold,
- const int mval)
- {
+ static void check_mem(const char *const func, const real_t *const p,
+ const realvec_t x, const real_t *const pold,
+ const int mval) {
realvec_t pv, pvwant;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
pv.set_elt(i, p[i]);
- pvwant.set_elt(i, mval & (1<<i) ? x[i] : pold[i]);
+ pvwant.set_elt(i, mval & (1 << i) ? x[i] : pold[i]);
}
const boolvec_t isbad = pv != pvwant;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " found=" << pv << " [" << hex(pv) << "]\n"
<< " expected=" << pvwant << " [" << hex(pvwant) << "]\n"
- << " isbad=" << isbad << "\n"
- << flush;
+ << " isbad=" << isbad << "\n" << flush;
}
}
-
- static void check_bool(const char* const func,
- const bool rstd, const bool rvml)
- {
+
+ static void check_bool(const char *const func, const bool rstd,
+ const bool rvml) {
const bool dr = rstd ^ rvml;
const bool isbad = dr;
if (isbad) {
- ++ num_errors;
+ ++num_errors;
cout << "Error in " << func << ":\n"
<< " fstd()=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml()=" << rvml << " [" << hex(rvml) << "]\n"
- << " isbad()=" << isbad << "\n"
- << flush;
+ << " isbad()=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_bool(const char* const func,
- const bool rstd, const bool rvml, const A x)
- {
+
+ template <typename A>
+ static void check_bool(const char *const func, const bool rstd,
+ const bool rvml, const A x) {
const bool dr = rstd ^ rvml;
const bool isbad = dr;
if (isbad) {
- ++ num_errors;
+ ++num_errors;
cout << "Error in " << func << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
- << " isbad(x)=" << isbad << "\n"
- << flush;
+ << " isbad(x)=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_bool(const char* const func,
- const boolvec_t rstd, const boolvec_t rvml,
- const A x)
- {
+
+ template <typename A>
+ static void check_bool(const char *const func, const boolvec_t rstd,
+ const boolvec_t rvml, const A x) {
boolvec_t dr;
bool isbad = false;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
dr.set_elt(i, rstd[i] ^ rvml[i]);
isbad |= dr[i];
}
if (isbad) {
- ++ num_errors;
+ ++num_errors;
cout << "Error in " << func << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x)=" << dr << " [" << hex(rvml) << "]\n"
- << " isbad(x)=" << isbad << "\n"
- << flush;
+ << " isbad(x)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_bool(const char* const func,
- const boolvec_t rstd, const boolvec_t rvml,
- const A x, const B y)
- {
+
+ template <typename A, typename B>
+ static void check_bool(const char *const func, const boolvec_t rstd,
+ const boolvec_t rvml, const A x, const B y) {
boolvec_t dr;
bool isbad = false;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
dr.set_elt(i, rstd[i] ^ rvml[i]);
isbad |= dr[i];
}
if (isbad) {
- ++ num_errors;
+ ++num_errors;
cout << "Error in " << func << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y)=" << dr << " [" << hex(rvml) << "]\n"
- << " isbad(x,y)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y)=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_bool(const char* const func,
- bool fstd(typename A::scalar_t x),
- boolvec_t fvml(A x),
- const A x)
- {
+
+ template <typename A>
+ static void check_bool(const char *const func,
+ bool fstd(typename A::scalar_t x), boolvec_t fvml(A x),
+ const A x) {
boolvec_t rstd;
- for (int i=0; i<boolvec_t::size; ++i) {
+ for (int i = 0; i < boolvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i]));
}
const boolvec_t rvml = fvml(x);
const boolvec_t dr = rstd != rvml;
const boolvec_t isbad = supported(x) && supported(rstd) && dr;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x)=" << isbad << "\n"
- << flush;
+ << " isbad(x)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_bool(const char* const func,
+
+ template <typename A, typename B>
+ static void check_bool(const char *const func,
bool fstd(typename A::scalar_t x,
typename B::scalar_t y),
- boolvec_t fvml(A x, B y),
- const A x, const B y)
- {
+ boolvec_t fvml(A x, B y), const A x, const B y) {
boolvec_t rstd;
- for (int i=0; i<boolvec_t::size; ++i) {
+ for (int i = 0; i < boolvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i]));
}
const boolvec_t rvml = fvml(x, y);
const boolvec_t dr = rstd != rvml;
const boolvec_t isbad = supported(x) && supported(rstd) && dr;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x,y)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B, typename C>
- static void check_bool(const char* const func,
- bool fstd(typename A::scalar_t x,
- typename B::scalar_t y,
- typename C::scalar_t z),
- boolvec_t fvml(A x, B y, C z),
- const A x, const B y, const C z)
- {
+
+ template <typename A, typename B, typename C>
+ static void
+ check_bool(const char *const func,
+ bool fstd(typename A::scalar_t x, typename B::scalar_t y,
+ typename C::scalar_t z),
+ boolvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
boolvec_t rstd;
- for (int i=0; i<boolvec_t::size; ++i) {
+ for (int i = 0; i < boolvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i], z[i]));
}
const boolvec_t rvml = fvml(x, y, z);
const boolvec_t dr = rstd != rvml;
const boolvec_t isbad = supported(x) && supported(rstd) && dr;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " z=" << z << " [" << hex(z) << "]\n"
<< " fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y,z)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x,y,z)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y,z)=" << isbad << "\n" << flush;
}
}
-
- static void check_int(const char* const func,
- const int_t rstd, const int_t rvml)
- {
+
+ static void check_int(const char *const func, const int_t rstd,
+ const int_t rvml) {
const int_t dr = rstd - rvml;
const bool isbad = dr;
if (isbad) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " fstd()=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml()=" << rvml << " [" << hex(rvml) << "]\n"
<< " error()=" << dr << " [" << hex(dr) << "]\n"
- << " isbad()=" << isbad << "\n"
- << flush;
+ << " isbad()=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_int(const char* const func,
- int_t fstd(typename A::scalar_t x),
- intvec_t fvml(A x),
- const A x)
- {
+
+ template <typename A>
+ static void check_int(const char *const func,
+ int_t fstd(typename A::scalar_t x), intvec_t fvml(A x),
+ const A x) {
intvec_t rstd;
- for (int i=0; i<intvec_t::size; ++i) {
+ for (int i = 0; i < intvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i]));
}
const intvec_t rvml = fvml(x);
const intvec_t dr = rstd - rvml;
const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr);
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x)=" << isbad << "\n"
- << flush;
+ << " isbad(x)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_int(const char* const func,
+
+ template <typename A, typename B>
+ static void check_int(const char *const func,
int_t fstd(typename A::scalar_t x, B y),
- intvec_t fvml(A x, B y),
- const A x, const B y)
- {
+ intvec_t fvml(A x, B y), const A x, const B y) {
intvec_t rstd;
- for (int i=0; i<intvec_t::size; ++i) {
+ for (int i = 0; i < intvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y));
}
const intvec_t rvml = fvml(x, y);
const intvec_t dr = rstd - rvml;
const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr);
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x,y)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_int(const char* const func,
+
+ template <typename A, typename B>
+ static void check_int(const char *const func,
int_t fstd(typename A::scalar_t x,
typename B::scalar_t y),
- intvec_t fvml(A x, B y),
- const A x, const B y)
- {
+ intvec_t fvml(A x, B y), const A x, const B y) {
intvec_t rstd;
- for (int i=0; i<intvec_t::size; ++i) {
+ for (int i = 0; i < intvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i]));
}
const intvec_t rvml = fvml(x, y);
const intvec_t dr = rstd - rvml;
const boolvec_t isbad =
- supported(x) && supported(y) && supported(rstd) && convert_bool(dr);
+ supported(x) && supported(y) && supported(rstd) && convert_bool(dr);
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x,y)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y)=" << isbad << "\n" << flush;
}
}
-
- template<typename A, typename B, typename C>
- static void check_int(const char* const func,
- int_t fstd(typename A::scalar_t x,
- typename B::scalar_t y,
- typename C::scalar_t z),
- intvec_t fvml(A x, B y, C z),
- const A x, const B y, const C z)
- {
+
+ template <typename A, typename B, typename C>
+ static void
+ check_int(const char *const func,
+ int_t fstd(typename A::scalar_t x, typename B::scalar_t y,
+ typename C::scalar_t z),
+ intvec_t fvml(A x, B y, C z), const A x, const B y, const C z) {
intvec_t rstd;
- for (int i=0; i<intvec_t::size; ++i) {
+ for (int i = 0; i < intvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i], z[i]));
}
const intvec_t rvml = fvml(x, y, z);
const intvec_t dr = rstd - rvml;
- const boolvec_t isbad =
- supported(x) && supported(y) && supported(z) && supported(rstd) &&
- convert_bool(dr);
+ const boolvec_t isbad = supported(x) && supported(y) && supported(z) &&
+ supported(rstd) && convert_bool(dr);
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " z=" << z << " [" << hex(z) << "]\n"
<< " fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x,y,z)=" << dr << " [" << hex(dr) << "]\n"
- << " isbad(x,y,z)=" << isbad << "\n"
- << flush;
+ << " isbad(x,y,z)=" << isbad << "\n" << flush;
}
}
-
- static void check_real(const char* const func,
- const real_t rstd, const real_t rvml)
- {
+
+ static void check_real(const char *const func, const real_t rstd,
+ const real_t rvml) {
const real_t dr = rstd - rvml;
const bool isbad = dr != R(0.0);
if (isbad) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << "():\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << "():\n"
<< " fstd()=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml()=" << rvml << " [" << hex(rvml) << "]\n"
<< " error()=" << dr << "\n"
- << " isbad()=" << isbad << "\n"
- << flush;
+ << " isbad()=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_real(const char* const func,
- const real_t rstd, const real_t rvml, const A x,
- const real_t accuracy)
- {
+
+ template <typename A>
+ static void check_real(const char *const func, const real_t rstd,
+ const real_t rvml, const A x, const real_t accuracy) {
const real_t dr = rstd - rvml;
real_t maxabs = 0.0;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
maxabs = vml_std::fmax(maxabs, vml_std::fabs(x[i]));
}
const real_t scale = fabs(rstd) + fabs(rvml) + fabs(maxabs) + R(1.0);
const bool isbad = fabs(dr) > accuracy * scale;
if (isbad) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << "():\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << "():\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
<< " error(x)=" << dr << "\n"
- << " isbad(x)=" << isbad << "\n"
- << flush;
+ << " isbad(x)=" << isbad << "\n" << flush;
}
}
-
- template<typename A>
- static void check_real(const char* const func,
- real_t fstd(typename A::scalar_t x),
- realvec_t fvml(A x),
- const A x,
- const real_t accuracy)
- {
+
+ template <typename A>
+ static void
+ check_real(const char *const func, real_t fstd(typename A::scalar_t x),
+ realvec_t fvml(A x), const A x, const real_t accuracy) {
realvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i]));
}
const realvec_t rvml = fvml(x);
const realvec_t dr = rstd - rvml;
const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
- const boolvec_t isbad =
- supported(x) && supported(rstd) &&
- fabs(dr) > realvec_t(accuracy) * scale;
+ const boolvec_t isbad = supported(x) && supported(rstd) &&
+ fabs(dr) > realvec_t(accuracy) * scale;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n"
<< " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n"
<< " abs-error(x)=" << fabs(dr) << "\n"
<< " rel-error(x)=" << fabs(dr) / scale << "\n"
<< " isbad(x)=" << isbad << "\n"
- << " accuracy=" << accuracy << "\n"
- << flush;
+ << " accuracy=" << accuracy << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_real(const char* const func,
+
+ template <typename A, typename B>
+ static void check_real(const char *const func,
real_t fstd(typename A::scalar_t x, B y),
- realvec_t fvml(A x, B y),
- const A x, const B y,
- const real_t accuracy)
- {
+ realvec_t fvml(A x, B y), const A x, const B y,
+ const real_t accuracy) {
realvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y));
}
const realvec_t rvml = fvml(x, y);
const realvec_t dr = rstd - rvml;
const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
- const boolvec_t isbad =
- supported(x) && supported(rstd) && fabs(dr) > realvec_t(accuracy) * scale;
+ const boolvec_t isbad = supported(x) && supported(rstd) &&
+ fabs(dr) > realvec_t(accuracy) * scale;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
@@ -583,38 +500,32 @@ struct vecmathlib_test {
<< " abs-error(x,y)=" << fabs(dr) << "\n"
<< " rel-error(x,y)=" << fabs(dr) / scale << "\n"
<< " isbad(x,y)=" << isbad << "\n"
- << " accuracy=" << accuracy << "\n"
- << flush;
+ << " accuracy=" << accuracy << "\n" << flush;
}
}
-
- template<typename A, typename B>
- static void check_real(const char* const func,
- real_t fstd(typename A::scalar_t x,
- typename B::scalar_t y),
- realvec_t fvml(A x, B y),
- const A x, const B y,
- const real_t accuracy,
- const realvec_t offset = RV(0.0))
- {
+
+ template <typename A, typename B>
+ static void
+ check_real(const char *const func,
+ real_t fstd(typename A::scalar_t x, typename B::scalar_t y),
+ realvec_t fvml(A x, B y), const A x, const B y,
+ const real_t accuracy, const realvec_t offset = RV(0.0)) {
realvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i]));
}
realvec_t rvml = fvml(x, y);
// Fix up rvml by adding/subtracting the offset
- rvml = ifthen(fabs(rstd-rvml)>fabs(offset/RV(2.0)),
- rvml + copysign(offset, rstd-rvml),
- rvml);
+ rvml = ifthen(fabs(rstd - rvml) > fabs(offset / RV(2.0)),
+ rvml + copysign(offset, rstd - rvml), rvml);
const realvec_t dr = rstd - rvml;
const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
- const boolvec_t isbad =
- supported(x) && supported(y) && supported(rstd) &&
- fabs(dr) > realvec_t(accuracy) * scale;
+ const boolvec_t isbad = supported(x) && supported(y) && supported(rstd) &&
+ fabs(dr) > realvec_t(accuracy) * scale;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n"
@@ -622,34 +533,31 @@ struct vecmathlib_test {
<< " abs-error(x,y)=" << fabs(dr) << "\n"
<< " rel-error(x,y)=" << fabs(dr) / scale << "\n"
<< " isbad(x,y)=" << isbad << "\n"
- << " accuracy=" << accuracy << "\n"
- << flush;
+ << " accuracy=" << accuracy << "\n" << flush;
}
}
-
- template<typename A, typename B, typename C>
- static void check_real(const char* const func,
+
+ template <typename A, typename B, typename C>
+ static void check_real(const char *const func,
real_t fstd(typename A::scalar_t x,
typename B::scalar_t y,
typename C::scalar_t z),
- realvec_t fvml(A x, B y, C z),
- const A x, const B y, C const z,
- const real_t accuracy)
- {
+ realvec_t fvml(A x, B y, C z), const A x, const B y,
+ C const z, const real_t accuracy) {
realvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, fstd(x[i], y[i], z[i]));
}
const realvec_t rvml = fvml(x, y, z);
const realvec_t dr = rstd - rvml;
const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0);
- const boolvec_t isbad =
- supported(x) && supported(y) && supported(z) && supported(rstd) &&
- fabs(dr) > realvec_t(accuracy) * scale;
+ const boolvec_t isbad = supported(x) && supported(y) && supported(z) &&
+ supported(rstd) &&
+ fabs(dr) > realvec_t(accuracy) * scale;
if (any(isbad)) {
- ++ num_errors;
- cout << setprecision(realvec_t::digits10+2)
- << "Error in " << func << ":\n"
+ ++num_errors;
+ cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func
+ << ":\n"
<< " x=" << x << " [" << hex(x) << "]\n"
<< " y=" << y << " [" << hex(y) << "]\n"
<< " z=" << z << " [" << hex(z) << "]\n"
@@ -658,61 +566,57 @@ struct vecmathlib_test {
<< " abs-error(x,y,z)=" << fabs(dr) << "\n"
<< " rel-error(x,y,z)=" << fabs(dr) / scale << "\n"
<< " isbad(x,y,z)=" << isbad << "\n"
- << " accuracy=" << accuracy << "\n"
- << flush;
+ << " accuracy=" << accuracy << "\n" << flush;
}
}
-
-
-
- static real_t* align_mem(real_t* p)
- {
+
+ static real_t *align_mem(real_t *p) {
const ptrdiff_t alignment = sizeof(realvec_t);
- p = (real_t*)((intptr_t(p) + alignment-1) & -alignment);
+ p = (real_t *)((intptr_t(p) + alignment - 1) & -alignment);
assert(intptr_t(p) % alignment == 0);
return p;
}
- static string add_suffix(const char* str, int i)
- {
+ static string add_suffix(const char *str, int i) {
ostringstream buf;
buf << str << "." << i;
return buf.str();
}
- static void test_mem()
- {
- cout << " testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush;
+ static void test_mem() {
+ cout << " testing loada loadu storea storeu (errors may lead to "
+ "segfaults)...\n"
+ << flush;
const int n = 4;
const int sz = realvec_t::size;
- const int nbytes = n*sz*sizeof(real_t);
- real_t* const x = align_mem(new real_t[(n+1)*sz]);
- real_t* const xnew = align_mem(new real_t[(n+1)*sz]);
- for (int i=0; i<n; ++i) {
+ const int nbytes = n * sz * sizeof(real_t);
+ real_t *const x = align_mem(new real_t[(n + 1) * sz]);
+ real_t *const xnew = align_mem(new real_t[(n + 1) * sz]);
+ for (int i = 0; i < n; ++i) {
realvec_t xv = random(R(-10.0), R(+10.0));
- memcpy(&x[i*sz], &xv, sizeof xv);
+ memcpy(&x[i * sz], &xv, sizeof xv);
}
const realvec_t z = random(R(-10.0), R(+10.0));
-
+
// loada
{
const real_t *p = &x[sz];
realvec_t y = realvec_t::loada(p);
check_mem("loada", y, p, z, ~0);
}
-
+
// loadu
- for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
const real_t *p = &x[sz];
- realvec_t y = realvec_t::loadu(p+i);
- check_mem(add_suffix("loadu", i).c_str(), y, p+i, z, ~0);
+ realvec_t y = realvec_t::loadu(p + i);
+ check_mem(add_suffix("loadu", i).c_str(), y, p + i, z, ~0);
}
-
+
// loadu(ioff)
- for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
const real_t *p = &x[sz];
realvec_t y = realvec_t::loadu(p, ioff);
- check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p+ioff, z, ~0);
+ check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p + ioff, z, ~0);
}
-
+
// storea
{
memcpy(xnew, x, nbytes);
@@ -720,50 +624,51 @@ struct vecmathlib_test {
storea(z, p);
check_mem("storea", p, z, &x[sz], ~0);
}
-
+
// storeu
- for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
memcpy(xnew, x, nbytes);
real_t *p = &xnew[sz];
- storeu(z, p+i);
- check_mem(add_suffix("storeu", i).c_str(), p+i, z, &x[sz+i], ~0);
+ storeu(z, p + i);
+ check_mem(add_suffix("storeu", i).c_str(), p + i, z, &x[sz + i], ~0);
}
-
+
// storeu
- for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
memcpy(xnew, x, nbytes);
real_t *p = &xnew[sz];
storeu(z, p, ioff);
- check_mem(add_suffix("storeu(ioff)", ioff).c_str(),
- p+ioff, z, &x[sz+ioff], ~0);
+ check_mem(add_suffix("storeu(ioff)", ioff).c_str(), p + ioff, z,
+ &x[sz + ioff], ~0);
}
-
- for (int mval=0; mval<(1<<realvec_t::size); ++mval) {
+
+ for (int mval = 0; mval < (1 << realvec_t::size); ++mval) {
boolvec_t mbool;
- for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i));
+ for (int i = 0; i < realvec_t::size; ++i)
+ mbool.set_elt(i, mval & (1 << i));
typename realvec_t::mask_t mask(mbool);
-
+
// loada(mask)
{
const real_t *p = &x[sz];
realvec_t y = loada(p, z, mask);
check_mem("loada(mask)", y, p, z, mval);
}
-
+
// loadu(mask)
- for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
const real_t *p = &x[sz];
- realvec_t y = loadu(p+i, z, mask);
- check_mem("loadu(mask)", y, p+i, z, mval);
+ realvec_t y = loadu(p + i, z, mask);
+ check_mem("loadu(mask)", y, p + i, z, mval);
}
-
+
// loadu(ioff, mask)
- for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
const real_t *p = &x[sz];
realvec_t y = loadu(p, ioff, z, mask);
- check_mem("loadu(ioff,mask)", y, p+ioff, z, mval);
+ check_mem("loadu(ioff,mask)", y, p + ioff, z, mval);
}
-
+
// storea
{
memcpy(xnew, x, nbytes);
@@ -771,37 +676,35 @@ struct vecmathlib_test {
storea(z, p, mask);
check_mem("storea(mask)", p, z, &x[sz], mval);
}
-
+
// storeu
- for (ptrdiff_t i=0; i<realvec_t::size; ++i) {
+ for (ptrdiff_t i = 0; i < realvec_t::size; ++i) {
memcpy(xnew, x, nbytes);
real_t *p = &xnew[sz];
- storeu(z, p+i, mask);
- check_mem("storeu(mask)", p+i, z, &x[sz+i], mval);
+ storeu(z, p + i, mask);
+ check_mem("storeu(mask)", p + i, z, &x[sz + i], mval);
}
-
+
// storeu
- for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) {
+ for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) {
memcpy(xnew, x, nbytes);
real_t *p = &xnew[sz];
storeu(z, p, ioff, mask);
- check_mem("storeu(ioff,mask)", p+ioff, z, &x[sz+ioff], mval);
+ check_mem("storeu(ioff,mask)", p + ioff, z, &x[sz + ioff], mval);
}
-
+
} // for mval
}
-
-
-
- template<typename T>
- static T local_ifthen(bool b, T x, T y) { return b ? x : y; }
- static void test_bool()
- {
+
+ template <typename T> static T local_ifthen(bool b, T x, T y) {
+ return b ? x : y;
+ }
+ static void test_bool() {
cout << " testing boolean operations...\n" << flush;
-
+
const boolvec_t bf = boolvec_t(false);
const boolvec_t bt = boolvec_t(true);
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
check_bool("false", false, bf[i]);
check_bool("true", true, bt[i]);
}
@@ -809,32 +712,32 @@ struct vecmathlib_test {
check_bool("all", true, all(bt), true);
check_bool("any", false, any(bf), false);
check_bool("any", true, any(bt), true);
-
+
boolvec_t b0 = bt;
boolvec_t b1 = bf;
- for (int n=0; n<realvec_t::size; ++n) {
+ for (int n = 0; n < realvec_t::size; ++n) {
b0.set_elt(n, false);
b1.set_elt(n, true);
- for (int i=0; i<realvec_t::size; ++i) {
- check_bool("set_elt", i<=n ? false : true, b0[i], false);
- check_bool("set_elt", i<=n ? true : false, b1[i], true);
+ for (int i = 0; i < realvec_t::size; ++i) {
+ check_bool("set_elt", i <= n ? false : true, b0[i], false);
+ check_bool("set_elt", i <= n ? true : false, b1[i], true);
}
}
-
- for (int n=0; n<(1<<realvec_t::size); ++n) {
+
+ for (int n = 0; n < (1 << realvec_t::size); ++n) {
boolvec_t x;
- for (int i=0; i<realvec_t::size; ++i) {
- x.set_elt(i, n & (1<<i));
+ for (int i = 0; i < realvec_t::size; ++i) {
+ x.set_elt(i, n & (1 << i));
}
- for (int i=0; i<realvec_t::size; ++i) {
- bool rstd = n & (1<<i);
+ for (int i = 0; i < realvec_t::size; ++i) {
+ bool rstd = n & (1 << i);
bool rvml = x[i];
check_bool("[]", rstd, rvml, x);
}
-
+
{
boolvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, !x[i]);
}
boolvec_t rvml = !x;
@@ -842,7 +745,7 @@ struct vecmathlib_test {
}
{
bool rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd &= x[i];
}
bool rvml = all(x);
@@ -850,39 +753,36 @@ struct vecmathlib_test {
}
{
bool rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd |= x[i];
}
bool rvml = any(x);
check_bool("any", rstd, rvml, x);
}
- check_bool
- ("ifthen(bool)",
- local_ifthen<bool>,
- (boolvec_t(*)(boolvec_t,boolvec_t,boolvec_t))vecmathlib::ifthen,
- x, BV(false), BV(true));
- check_int("ifthen(int)",
- local_ifthen<int_t>,
- (intvec_t(*)(boolvec_t,intvec_t,intvec_t))vecmathlib::ifthen,
+ check_bool(
+ "ifthen(bool)", local_ifthen<bool>,
+ (boolvec_t (*)(boolvec_t, boolvec_t, boolvec_t))vecmathlib::ifthen, x,
+ BV(false), BV(true));
+ check_int("ifthen(int)", local_ifthen<int_t>,
+ (intvec_t (*)(boolvec_t, intvec_t, intvec_t))vecmathlib::ifthen,
x, IV(I(1)), IV(I(2)));
- check_real("ifthen(real)",
- local_ifthen<real_t>,
- ((realvec_t(*)(boolvec_t,realvec_t,realvec_t))
- vecmathlib::ifthen),
- x, RV(1.0), RV(2.0), R(0.0));
- }
-
- for (int n=0; n<(1<<realvec_t::size); ++n) {
- for (int m=0; m<(1<<realvec_t::size); ++m) {
+ check_real(
+ "ifthen(real)", local_ifthen<real_t>,
+ ((realvec_t (*)(boolvec_t, realvec_t, realvec_t))vecmathlib::ifthen),
+ x, RV(1.0), RV(2.0), R(0.0));
+ }
+
+ for (int n = 0; n < (1 << realvec_t::size); ++n) {
+ for (int m = 0; m < (1 << realvec_t::size); ++m) {
boolvec_t x, y;
- for (int i=0; i<realvec_t::size; ++i) {
- x.set_elt(i, n & (1<<i));
- y.set_elt(i, m & (1<<i));
+ for (int i = 0; i < realvec_t::size; ++i) {
+ x.set_elt(i, n & (1 << i));
+ y.set_elt(i, m & (1 << i));
}
-
+
{
boolvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, x[i] && y[i]);
}
boolvec_t rvml = x && y;
@@ -890,7 +790,7 @@ struct vecmathlib_test {
}
{
boolvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, x[i] || y[i]);
}
boolvec_t rvml = x || y;
@@ -898,7 +798,7 @@ struct vecmathlib_test {
}
{
boolvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, x[i] == y[i]);
}
boolvec_t rvml = x == y;
@@ -906,7 +806,7 @@ struct vecmathlib_test {
}
{
boolvec_t rstd;
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
rstd.set_elt(i, x[i] != y[i]);
}
boolvec_t rvml = x != y;
@@ -915,322 +815,374 @@ struct vecmathlib_test {
}
}
}
-
-
-
+
static bool local_convert_bool(int_t x) { return x; }
static int_t local_convert_int(bool x) { return x; }
- template<typename T> static T local_pos(T x) { return +x; }
- template<typename T> static T local_neg(T x) { return -x; }
- template<typename T> static T local_not(T x) { return ~x; }
- template<typename T> static T local_add(T x, T y) { return x+y; }
- template<typename T> static T local_sub(T x, T y) { return x-y; }
- template<typename T> static T local_mul(T x, T y) { return x*y; }
- template<typename T> static T local_div(T x, T y) { return x/y; }
- template<typename T> static T local_mod(T x, T y) { return x%y; }
- template<typename T> static T local_and(T x, T y) { return x&y; }
- template<typename T> static T local_or(T x, T y) { return x|y; }
- template<typename T> static T local_xor(T x, T y) { return x^y; }
-
- static int_t local_lsr(int_t x, int_t y) { return uint_t(x)>>uint_t(y); }
- template<typename T> static T local_srs(T x, typename T::scalar_t y)
- {
- return x>>y;
+ template <typename T> static T local_pos(T x) { return +x; }
+ template <typename T> static T local_neg(T x) { return -x; }
+ template <typename T> static T local_not(T x) { return ~x; }
+ template <typename T> static T local_add(T x, T y) { return x + y; }
+ template <typename T> static T local_sub(T x, T y) { return x - y; }
+ template <typename T> static T local_mul(T x, T y) { return x * y; }
+ template <typename T> static T local_div(T x, T y) { return x / y; }
+ template <typename T> static T local_mod(T x, T y) { return x % y; }
+ template <typename T> static T local_and(T x, T y) { return x & y; }
+ template <typename T> static T local_or(T x, T y) { return x | y; }
+ template <typename T> static T local_xor(T x, T y) { return x ^ y; }
+
+ static int_t local_lsr(int_t x, int_t y) { return uint_t(x) >> uint_t(y); }
+ template <typename T> static T local_srs(T x, typename T::scalar_t y) {
+ return x >> y;
}
- template<typename T> static T local_sls(T x, typename T::scalar_t y)
- {
- return x<<y;
+ template <typename T> static T local_sls(T x, typename T::scalar_t y) {
+ return x << y;
}
- template<typename T> static T local_sr(T x, T y) { return x>>y; }
- template<typename T> static T local_sl(T x, T y) { return x<<y; }
-
- template<typename T> static bool local_isignbit(T x) { return x<0; }
- template<typename T> static bool local_eq(T x, T y) { return x==y; }
- template<typename T> static bool local_ne(T x, T y) { return x!=y; }
- template<typename T> static bool local_lt(T x, T y) { return x<y; }
- template<typename T> static bool local_le(T x, T y) { return x<=y; }
- template<typename T> static bool local_gt(T x, T y) { return x>y; }
- template<typename T> static bool local_ge(T x, T y) { return x>=y; }
- template<typename T> static boolvec_t local_veq(T x, T y) { return x==y; }
- template<typename T> static boolvec_t local_vne(T x, T y) { return x!=y; }
- template<typename T> static boolvec_t local_vlt(T x, T y) { return x<y; }
- template<typename T> static boolvec_t local_vle(T x, T y) { return x<=y; }
- template<typename T> static boolvec_t local_vgt(T x, T y) { return x>y; }
- template<typename T> static boolvec_t local_vge(T x, T y) { return x>=y; }
- static void test_int()
- {
+ template <typename T> static T local_sr(T x, T y) { return x >> y; }
+ template <typename T> static T local_sl(T x, T y) { return x << y; }
+
+ template <typename T> static bool local_isignbit(T x) { return x < 0; }
+ template <typename T> static bool local_eq(T x, T y) { return x == y; }
+ template <typename T> static bool local_ne(T x, T y) { return x != y; }
+ template <typename T> static bool local_lt(T x, T y) { return x < y; }
+ template <typename T> static bool local_le(T x, T y) { return x <= y; }
+ template <typename T> static bool local_gt(T x, T y) { return x > y; }
+ template <typename T> static bool local_ge(T x, T y) { return x >= y; }
+ template <typename T> static boolvec_t local_veq(T x, T y) { return x == y; }
+ template <typename T> static boolvec_t local_vne(T x, T y) { return x != y; }
+ template <typename T> static boolvec_t local_vlt(T x, T y) { return x < y; }
+ template <typename T> static boolvec_t local_vle(T x, T y) { return x <= y; }
+ template <typename T> static boolvec_t local_vgt(T x, T y) { return x > y; }
+ template <typename T> static boolvec_t local_vge(T x, T y) { return x >= y; }
+ static void test_int() {
cout << " testing integer operations...\n" << flush;
-
+
intvec_t i0 = intvec_t(I(0));
intvec_t i1 = intvec_t(I(1));
intvec_t iiota = intvec_t::iota();
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
check_int("0", 0, i0[i]);
check_int("1", 1, i1[i]);
check_int("iota", i, iiota[i]);
}
-
+
i0 = intvec_t(I(1));
i1 = intvec_t(I(0));
- for (int n=0; n<realvec_t::size; ++n) {
+ for (int n = 0; n < realvec_t::size; ++n) {
i0.set_elt(n, 0);
i1.set_elt(n, 1);
- for (int i=0; i<realvec_t::size; ++i) {
- check_bool("set_elt", i<=n ? 0 : 1, i0[i], 0);
- check_bool("set_elt", i<=n ? 1 : 0, i1[i], 1);
+ for (int i = 0; i < realvec_t::size; ++i) {
+ check_bool("set_elt", i <= n ? 0 : 1, i0[i], 0);
+ check_bool("set_elt", i <= n ? 1 : 0, i1[i], 1);
}
}
-
+
const int_t int_min = std::numeric_limits<int_t>::min();
const int_t int_max = std::numeric_limits<int_t>::max();
const int_t values[] = {
- 0, 1, 2, 3, -1, -2, -3,
- int_min, int_min+1, int_min+2, int_min+3,
- int_max, int_max-1, int_max-2, int_max-3,
+ 0, 1, 2, 3, -1,
+ -2, -3, int_min, int_min + 1, int_min + 2,
+ int_min + 3, int_max, int_max - 1, int_max - 2, int_max - 3,
};
const int nvalues = sizeof values / sizeof *values;
- for (int i=0; i<nvalues*nvalues+2*imax; ++i) {
+ for (int i = 0; i < nvalues * nvalues + 2 * imax; ++i) {
intvec_t x, y;
- if (i<nvalues*nvalues) {
- x = values[i%nvalues];
- y = values[i/nvalues];
- } else if (i<nvalues*nvalues+imax) {
+ if (i < nvalues * nvalues) {
+ x = values[i % nvalues];
+ y = values[i / nvalues];
+ } else if (i < nvalues * nvalues + imax) {
x = random(I(-100), I(+100));
y = random(I(-100), I(+100));
} else {
- x = random(int_min/2, int_max/2);
- y = random(int_min/2, int_max/2);
+ x = random(int_min / 2, int_max / 2);
+ y = random(int_min / 2, int_max / 2);
}
boolvec_t b = convert_bool(random(I(0), I(1)));
-
- check_bool<IV>("convert_bool(int)",
- local_convert_bool, vecmathlib::convert_bool, x);
- check_int<BV>("convert_int(bool)",
- local_convert_int, vecmathlib::convert_int, b);
-
+
+ check_bool<IV>("convert_bool(int)", local_convert_bool,
+ vecmathlib::convert_bool, x);
+ check_int<BV>("convert_int(bool)", local_convert_int,
+ vecmathlib::convert_int, b);
+
check_int<IV>("+", local_pos, local_pos, x);
check_int<IV>("-", local_neg, local_neg, x);
check_int<IV>("~", local_not, local_not, x);
-
- check_int<IV,IV>("+", local_add, local_add, x, y);
- check_int<IV,IV>("-", local_sub, local_sub, x, y);
- check_int<IV,IV>("&", local_and, local_and, x, y);
- check_int<IV,IV>("|", local_or, local_or, x, y);
- check_int<IV,IV>("^", local_xor, local_xor, x, y);
-
- const int_t bits = 8*sizeof(int_t);
- check_int<IV,I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits-1));
- check_int<IV,I>(">>", local_sr, local_srs, x, y[0] & (bits-1));
- check_int<IV,I>("<<", local_sl, local_sls, x, y[0] & (bits-1));
- check_int<IV,IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits-1));
- check_int<IV,IV>(">>", local_sr, local_sr, x, y & IV(bits-1));
- check_int<IV,IV>("<<", local_sl, local_sl, x, y & IV(bits-1));
-
+
+ check_int<IV, IV>("+", local_add, local_add, x, y);
+ check_int<IV, IV>("-", local_sub, local_sub, x, y);
+ check_int<IV, IV>("&", local_and, local_and, x, y);
+ check_int<IV, IV>("|", local_or, local_or, x, y);
+ check_int<IV, IV>("^", local_xor, local_xor, x, y);
+
+ const int_t bits = 8 * sizeof(int_t);
+ check_int<IV, I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits - 1));
+ check_int<IV, I>(">>", local_sr, local_srs, x, y[0] & (bits - 1));
+ check_int<IV, I>("<<", local_sl, local_sls, x, y[0] & (bits - 1));
+ check_int<IV, IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits - 1));
+ check_int<IV, IV>(">>", local_sr, local_sr, x, y & IV(bits - 1));
+ check_int<IV, IV>("<<", local_sl, local_sl, x, y & IV(bits - 1));
+
check_bool<IV>("isignbit", local_isignbit, vecmathlib::isignbit, x);
- check_bool<IV,IV>("==", local_eq, local_veq, x, y);
- check_bool<IV,IV>("!=", local_ne, local_vne, x, y);
- check_bool<IV,IV>("<", local_lt, local_vlt, x, y);
- check_bool<IV,IV>("<=", local_le, local_vle, x, y);
- check_bool<IV,IV>(">", local_gt, local_vgt, x, y);
- check_bool<IV,IV>(">=", local_ge, local_vge, x, y);
+ check_bool<IV, IV>("==", local_eq, local_veq, x, y);
+ check_bool<IV, IV>("!=", local_ne, local_vne, x, y);
+ check_bool<IV, IV>("<", local_lt, local_vlt, x, y);
+ check_bool<IV, IV>("<=", local_le, local_vle, x, y);
+ check_bool<IV, IV>(">", local_gt, local_vgt, x, y);
+ check_bool<IV, IV>(">=", local_ge, local_vge, x, y);
}
}
-
- static void test_real()
- {
+
+ static void test_real() {
cout << " testing real operations...\n" << flush;
-
+
realvec_t r0 = realvec_t(0.0);
realvec_t r1 = realvec_t(1.0);
- for (int i=0; i<realvec_t::size; ++i) {
+ for (int i = 0; i < realvec_t::size; ++i) {
check_real("0.0", R(0.0), r0[i]);
check_real("1.0", R(1.0), r1[i]);
}
-
+
r0 = realvec_t(1.0);
r1 = realvec_t(0.0);
- for (int n=0; n<realvec_t::size; ++n) {
+ for (int n = 0; n < realvec_t::size; ++n) {
r0.set_elt(n, R(0.0));
r1.set_elt(n, R(1.0));
- for (int i=0; i<realvec_t::size; ++i) {
- check_bool("set_elt", i<=n ? R(0.0) : R(1.0), r0[i], R(0.0));
- check_bool("set_elt", i<=n ? R(1.0) : R(0.0), r1[i], R(1.0));
+ for (int i = 0; i < realvec_t::size; ++i) {
+ check_bool("set_elt", i <= n ? R(0.0) : R(1.0), r0[i], R(0.0));
+ check_bool("set_elt", i <= n ? R(1.0) : R(0.0), r1[i], R(1.0));
}
}
-
+
// barrier
realvec_t rcancel = r1;
rcancel += RV(R(FP::max() / 2));
rcancel.barrier();
rcancel -= RV(R(FP::max() / 2));
check_real("barrier", R(0.0), rcancel[0]);
-
+
// rounding (break ties to even, or break ties away from zero?)
realvec_t rbase = RV(R(1.0));
- rbase += RV(FP::epsilon()/2);
+ rbase += RV(FP::epsilon() / 2);
check_real("flt_rounds", R(1.0), rbase[0]);
rbase = RV(R(1.0) + FP::epsilon());
- rbase += RV(FP::epsilon()/2);
- check_real("flt_rounds", R(1.0) + 2*FP::epsilon(), rbase[0]);
+ rbase += RV(FP::epsilon() / 2);
+ check_real("flt_rounds", R(1.0) + 2 * FP::epsilon(), rbase[0]);
}
-
- static int_t local_bitifthen(int_t x, int_t y, int_t z)
- {
+
+ static int_t local_bitifthen(int_t x, int_t y, int_t z) {
return (x & y) | (~x & z);
}
- static int_t local_clz(int_t x)
- {
+ static int_t local_clz(int_t x) {
int bits = CHAR_BIT * sizeof(x);
int res = 0;
- for (; res<bits; ++res) {
- if (x & (I(1) << (bits-res-1))) break;
+ for (; res < bits; ++res) {
+ if (x & (I(1) << (bits - res - 1)))
+ break;
}
return res;
}
- static int_t local_max(int_t x, int_t y)
- {
- return std::max(x, y);
- }
- static int_t local_min(int_t x, int_t y)
- {
- return std::min(x, y);
- }
- static int_t local_popcount(int_t x)
- {
+ static int_t local_max(int_t x, int_t y) { return std::max(x, y); }
+ static int_t local_min(int_t x, int_t y) { return std::min(x, y); }
+ static int_t local_popcount(int_t x) {
int bits = CHAR_BIT * sizeof(x);
int res = 0;
- for (int d=0; d<bits; ++d) {
- if (x & (I(1) << d)) ++res;
+ for (int d = 0; d < bits; ++d) {
+ if (x & (I(1) << d))
+ ++res;
}
return res;
}
- static int_t local_rotate(int_t x, int_t n)
- {
+ static int_t local_rotate(int_t x, int_t n) {
int_t mask = CHAR_BIT * sizeof(int_t) - 1;
int_t left = x << (n & mask);
int_t right = I(U(x) >> U(-n & mask));
return left | right;
}
- static void test_abs()
- {
- cout << " testing abs bitifthen clz isignbit max min popcount rotate...\n" << flush;
-
- for (int i=0; i<imax; ++i) {
+ static void test_abs() {
+ cout << " testing abs bitifthen clz isignbit max min popcount rotate...\n"
+ << flush;
+
+ for (int i = 0; i < imax; ++i) {
const intvec_t x = random(I(-1000000), I(+1000000));
const intvec_t y = random(I(-1000000), I(+1000000));
const intvec_t z = random(I(-1000000), I(+1000000));
-
+
check_int<IV>("abs", std::abs, vecmathlib::abs, x);
- check_int<IV,IV,IV>("bitifthen",
- local_bitifthen, vecmathlib::bitifthen, x, y, z);
+ check_int<IV, IV, IV>("bitifthen", local_bitifthen, vecmathlib::bitifthen,
+ x, y, z);
check_int<IV>("clz", local_clz, vecmathlib::clz, x);
- check_int<IV,IV>("max", local_max, vecmathlib::max, x, y);
- check_int<IV,IV>("min", local_min, vecmathlib::min, x, y);
+ check_int<IV, IV>("max", local_max, vecmathlib::max, x, y);
+ check_int<IV, IV>("min", local_min, vecmathlib::min, x, y);
check_int<IV>("popcount", local_popcount, vecmathlib::popcount, x);
- check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]);
- check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y);
+ check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]);
+ check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y);
}
}
-
+
// Change signature: "int" -> "int_t"
- static real_t local_frexp0(real_t x)
- {
+ static real_t local_frexp0(real_t x) {
int r;
return vml_std::frexp(x, &r);
}
- static int_t local_frexp1(real_t x)
- {
- if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max();
- if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min();
+ static int_t local_frexp1(real_t x) {
+ if (vml_std::isinf(x))
+ return std::numeric_limits<int_t>::max();
+ if (vml_std::isnan(x))
+ return std::numeric_limits<int_t>::min();
int r;
vml_std::frexp(x, &r);
return r;
}
- static realvec_t local_vfrexp0(realvec_t x)
- {
+ static realvec_t local_vfrexp0(realvec_t x) {
intvec_t r;
return vecmathlib::frexp(x, &r);
}
- static intvec_t local_vfrexp1(realvec_t x)
- {
+ static intvec_t local_vfrexp1(realvec_t x) {
intvec_t r;
vecmathlib::frexp(x, &r);
return r;
}
- static int_t local_ilogb(real_t x)
- {
- if (x==R(0.0)) return std::numeric_limits<int_t>::min();
- if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max();
- if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min();
+ static int_t local_ilogb(real_t x) {
+ if (x == R(0.0))
+ return std::numeric_limits<int_t>::min();
+ if (vml_std::isinf(x))
+ return std::numeric_limits<int_t>::max();
+ if (vml_std::isnan(x))
+ return std::numeric_limits<int_t>::min();
return vml_std::ilogb(x);
}
static real_t local_ldexp(real_t x, int_t n) { return ldexp(x, n); }
- static real_t local_mad(real_t x, real_t y, real_t z) { return x*y+z; }
- static void test_fabs()
- {
- cout << " testing + - + - * == != < <= > >= copysign fabs fdim fma fmax fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad nextafter signbit...\n" << flush;
-
+ static real_t local_mad(real_t x, real_t y, real_t z) { return x * y + z; }
+ static void test_fabs() {
+ cout << " testing + - + - * == != < <= > >= copysign fabs fdim fma fmax "
+ "fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad "
+ "nextafter signbit...\n"
+ << flush;
+
const real_t eps = FP::epsilon();
const real_t int_min = R(std::numeric_limits<int_t>::min());
const real_t int_max = R(std::numeric_limits<int_t>::max());
const real_t uint_min = R(std::numeric_limits<uint_t>::min());
const real_t uint_max = R(std::numeric_limits<uint_t>::max());
const real_t values[] = {
- R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1),
- R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1),
- R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps,
- R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps,
- R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps,
- R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps,
+ R(+0.0),
+ R(+0.1),
+ R(+0.9),
+ R(+1.0),
+ R(+1.1),
+ R(-0.0),
+ R(-0.1),
+ R(-0.9),
+ R(-1.0),
+ R(-1.1),
+ R(+0.0) + eps,
+ R(+0.1) + eps,
+ R(+0.9) + eps,
+ R(+1.0) + eps,
+ R(+1.1) + eps,
+ R(-0.0) + eps,
+ R(-0.1) + eps,
+ R(-0.9) + eps,
+ R(-1.0) + eps,
+ R(-1.1) + eps,
+ R(+0.0) - eps,
+ R(+0.1) - eps,
+ R(+0.9) - eps,
+ R(+1.0) - eps,
+ R(+1.1) - eps,
+ R(-0.0) - eps,
+ R(-0.1) - eps,
+ R(-0.9) - eps,
+ R(-1.0) - eps,
+ R(-1.1) - eps,
#ifdef VML_HAVE_DENORMALS
- +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0),
- -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0),
+ +FP::min(),
+ +FP::min() * (R(1.0) + eps),
+ +FP::min() * R(2.0),
+ -FP::min(),
+ -FP::min() * (R(1.0) + eps),
+ -FP::min() * R(2.0),
#endif
- +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps),
- -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps),
- +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps),
- -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps),
+ +FP::max(),
+ +FP::max() * (R(1.0) - eps),
+ +FP::max() * (R(1.0) - R(2.0) * eps),
+ -FP::max(),
+ -FP::max() * (R(1.0) - eps),
+ -FP::max() * (R(1.0) - R(2.0) * eps),
+ +R(0.5) * FP::max(),
+ +R(0.5) * FP::max() * (R(1.0) + eps),
+ -R(0.5) * FP::max(),
+ -R(0.5) * FP::max() * (R(1.0) + eps),
#ifdef VML_HAVE_INF
- +R(1.0/0.0), // +FP::infinity()
- -R(1.0/0.0), // -FP::infinity()
+ +R(1.0 / 0.0), // +FP::infinity()
+ -R(1.0 / 0.0), // -FP::infinity()
#endif
#ifdef VML_HAVE_NAN
- R(0.0/0.0), // FP::quiet_NaN()
+ R(0.0 / 0.0), // FP::quiet_NaN()
#endif
- +int_min, +int_max, +uint_min, +uint_max,
- -int_min, -int_max, -uint_min, -uint_max,
- +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1),
- -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1),
- +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1),
- -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1),
- +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0),
- -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0),
- +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0),
- -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0),
- -R(443.9999425),
+ +int_min,
+ +int_max,
+ +uint_min,
+ +uint_max,
+ -int_min,
+ -int_max,
+ -uint_min,
+ -uint_max,
+ +int_min + R(0.1),
+ +int_max + R(0.1),
+ +uint_min + R(0.1),
+ +uint_max + R(0.1),
+ -int_min + R(0.1),
+ -int_max + R(0.1),
+ -uint_min + R(0.1),
+ -uint_max + R(0.1),
+ +int_min - R(0.1),
+ +int_max - R(0.1),
+ +uint_min - R(0.1),
+ +uint_max - R(0.1),
+ -int_min - R(0.1),
+ -int_max - R(0.1),
+ -uint_min - R(0.1),
+ -uint_max - R(0.1),
+ +int_min + R(1.0),
+ +int_max + R(1.0),
+ +uint_min + R(1.0),
+ +uint_max + R(1.0),
+ -int_min + R(1.0),
+ -int_max + R(1.0),
+ -uint_min + R(1.0),
+ -uint_max + R(1.0),
+ +int_min - R(1.0),
+ +int_max - R(1.0),
+ +uint_min - R(1.0),
+ +uint_max - R(1.0),
+ -int_min - R(1.0),
+ -int_max - R(1.0),
+ -uint_min - R(1.0),
+ -uint_max - R(1.0),
+ -R(443.9999425),
};
const int nvalues = sizeof values / sizeof *values;
-
- for (int i=0; i<8*nvalues+imax; ++i) {
- const realvec_t x =
- i<8*nvalues && i&1 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
- const realvec_t y =
- i<8*nvalues && i&2 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
- const realvec_t z =
- i<8*nvalues && i&4 ? RV(values[i/8]) : random(R(-10.0), R(+10.0));
+
+ for (int i = 0; i < 8 * nvalues + imax; ++i) {
+ const realvec_t x = i < 8 * nvalues && i & 1 ? RV(values[i / 8])
+ : random(R(-10.0), R(+10.0));
+ const realvec_t y = i < 8 * nvalues && i & 2 ? RV(values[i / 8])
+ : random(R(-10.0), R(+10.0));
+ const realvec_t z = i < 8 * nvalues && i & 4 ? RV(values[i / 8])
+ : random(R(-10.0), R(+10.0));
const intvec_t n = random(int_t(-10), int_t(+10));
-
+
check_real<RV>("+", local_pos, local_pos, x, R(0.0));
check_real<RV>("-", local_neg, local_neg, x, R(0.0));
-
- check_real<RV,RV>("+", local_add, local_add, x, y, R(0.0));
- check_real<RV,RV>("-", local_sub, local_sub, x, y, R(0.0));
- check_real<RV,RV>("*", local_mul, local_mul, x, y, R(0.0));
-
+
+ check_real<RV, RV>("+", local_add, local_add, x, y, R(0.0));
+ check_real<RV, RV>("-", local_sub, local_sub, x, y, R(0.0));
+ check_real<RV, RV>("*", local_mul, local_mul, x, y, R(0.0));
+
{
real_t rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd += x[i];
}
real_t rvml = sum(x);
@@ -1238,7 +1190,7 @@ struct vecmathlib_test {
}
{
real_t rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd *= x[i];
}
real_t rvml = prod(x);
@@ -1246,7 +1198,7 @@ struct vecmathlib_test {
}
{
real_t rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd = vml_std::fmax(rstd, x[i]);
}
real_t rvml = vecmathlib::maxval(x);
@@ -1254,34 +1206,33 @@ struct vecmathlib_test {
}
{
real_t rstd = x[0];
- for (int i=1; i<realvec_t::size; ++i) {
+ for (int i = 1; i < realvec_t::size; ++i) {
rstd = vml_std::fmin(rstd, x[i]);
}
real_t rvml = vecmathlib::minval(x);
check_real("minval", rstd, rvml, x, R(0.0));
}
-
- check_bool<RV,RV>("==", local_eq, local_veq, x, y);
- check_bool<RV,RV>("!=", local_ne, local_vne, x, y);
- check_bool<RV,RV>("<", local_lt, local_vlt, x, y);
- check_bool<RV,RV>("<=", local_le, local_vle, x, y);
- check_bool<RV,RV>(">", local_gt, local_vgt, x, y);
- check_bool<RV,RV>(">=", local_ge, local_vge, x, y);
-
- check_real<RV,RV>("copysign",
- vml_std::copysign, vecmathlib::copysign, x, y, 0.0);
+
+ check_bool<RV, RV>("==", local_eq, local_veq, x, y);
+ check_bool<RV, RV>("!=", local_ne, local_vne, x, y);
+ check_bool<RV, RV>("<", local_lt, local_vlt, x, y);
+ check_bool<RV, RV>("<=", local_le, local_vle, x, y);
+ check_bool<RV, RV>(">", local_gt, local_vgt, x, y);
+ check_bool<RV, RV>(">=", local_ge, local_vge, x, y);
+
+ check_real<RV, RV>("copysign", vml_std::copysign, vecmathlib::copysign, x,
+ y, 0.0);
check_real<RV>("fabs", vml_std::fabs, vecmathlib::fabs, x, 0.0);
- check_real<RV,RV>("fdim",
- vml_std::fdim, vecmathlib::fdim, x, y, accuracy());
- check_real<RV,RV,RV>("fma",
- vml_std::fma, vecmathlib::fma,
- x, y, z, R(10.0)*accuracy());
- check_real<RV,RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0);
- check_real<RV,RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0);
+ check_real<RV, RV>("fdim", vml_std::fdim, vecmathlib::fdim, x, y,
+ accuracy());
+ check_real<RV, RV, RV>("fma", vml_std::fma, vecmathlib::fma, x, y, z,
+ R(10.0) * accuracy());
+ check_real<RV, RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0);
+ check_real<RV, RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0);
check_real<RV>("frexp0", local_frexp0, local_vfrexp0, x, 0.0);
check_int<RV>("frexp1", local_frexp1, local_vfrexp1, x);
- check_int<RV>("ilogb",
- local_ilogb, (intvec_t(*)(realvec_t))vecmathlib::ilogb, x);
+ check_int<RV>("ilogb", local_ilogb,
+ (intvec_t (*)(realvec_t))vecmathlib::ilogb, x);
#if defined VML_HAVE_INF || defined VML_HAVE_NAN
check_bool<RV>("isfinite", vml_std::isfinite, vecmathlib::isfinite, x);
#endif
@@ -1294,91 +1245,162 @@ struct vecmathlib_test {
#ifdef VML_HAVE_DENORMALS
check_bool<RV>("isnormal", vml_std::isnormal, vecmathlib::isnormal, x);
#endif
- check_real<RV,I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0);
- check_real<RV,IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0);
- check_real<RV,RV,RV>("mad",
- local_mad, vecmathlib::mad,
- x, y, z, R(10.0)*accuracy());
- check_real<RV,RV>("nextafter",
- vml_std::nextafter, vecmathlib::nextafter, x, y, 0.0);
+ check_real<RV, I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0);
+ check_real<RV, IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0);
+ check_real<RV, RV, RV>("mad", local_mad, vecmathlib::mad, x, y, z,
+ R(10.0) * accuracy());
+ check_real<RV, RV>("nextafter", vml_std::nextafter, vecmathlib::nextafter,
+ x, y, 0.0);
check_bool<RV>("signbit", vml_std::signbit, vecmathlib::signbit, x);
}
}
-
- static void test_convert()
- {
- cout << " testing ceil convert_float convert_int floor rint round trunc...\n"
+
+ static void test_convert() {
+ cout << " testing ceil convert_float convert_int floor rint round "
+ "trunc...\n"
<< flush;
-
+
const real_t eps = FP::epsilon();
const real_t int_min = R(std::numeric_limits<int_t>::min());
const real_t int_max = R(std::numeric_limits<int_t>::max());
const real_t uint_min = R(std::numeric_limits<uint_t>::min());
const real_t uint_max = R(std::numeric_limits<uint_t>::max());
- const real_t mantissa_max = (U(1) << (FP::mantissa_bits+1)) - U(1);
- const real_t real_max =
- (((U(1) << (FP::mantissa_bits+1)) - U(1)) << (FP::exponent_bits-1)) +
- (U(1) << (FP::exponent_bits-1)) - U(1);
+ const real_t mantissa_max = (U(1) << (FP::mantissa_bits + 1)) - U(1);
+ const real_t real_max = (((U(1) << (FP::mantissa_bits + 1)) - U(1))
+ << (FP::exponent_bits - 1)) +
+ (U(1) << (FP::exponent_bits - 1)) - U(1);
const real_t values[] = {
- R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1),
- R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1),
- R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps,
- R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps,
- R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps,
- R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps,
+ R(+0.0),
+ R(+0.1),
+ R(+0.9),
+ R(+1.0),
+ R(+1.1),
+ R(-0.0),
+ R(-0.1),
+ R(-0.9),
+ R(-1.0),
+ R(-1.1),
+ R(+0.0) + eps,
+ R(+0.1) + eps,
+ R(+0.9) + eps,
+ R(+1.0) + eps,
+ R(+1.1) + eps,
+ R(-0.0) + eps,
+ R(-0.1) + eps,
+ R(-0.9) + eps,
+ R(-1.0) + eps,
+ R(-1.1) + eps,
+ R(+0.0) - eps,
+ R(+0.1) - eps,
+ R(+0.9) - eps,
+ R(+1.0) - eps,
+ R(+1.1) - eps,
+ R(-0.0) - eps,
+ R(-0.1) - eps,
+ R(-0.9) - eps,
+ R(-1.0) - eps,
+ R(-1.1) - eps,
#ifdef VML_HAVE_DENORMALS
- +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0),
- -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0),
+ +FP::min(),
+ +FP::min() * (R(1.0) + eps),
+ +FP::min() * R(2.0),
+ -FP::min(),
+ -FP::min() * (R(1.0) + eps),
+ -FP::min() * R(2.0),
#endif
- +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps),
- -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps),
- +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps),
- -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps),
+ +FP::max(),
+ +FP::max() * (R(1.0) - eps),
+ +FP::max() * (R(1.0) - R(2.0) * eps),
+ -FP::max(),
+ -FP::max() * (R(1.0) - eps),
+ -FP::max() * (R(1.0) - R(2.0) * eps),
+ +R(0.5) * FP::max(),
+ +R(0.5) * FP::max() * (R(1.0) + eps),
+ -R(0.5) * FP::max(),
+ -R(0.5) * FP::max() * (R(1.0) + eps),
#ifdef VML_HAVE_INF
- +R(1.0/0.0), // +FP::infinity()
- -R(1.0/0.0), // -FP::infinity()
+ +R(1.0 / 0.0), // +FP::infinity()
+ -R(1.0 / 0.0), // -FP::infinity()
#endif
#ifdef VML_HAVE_NAN
- R(0.0/0.0), // FP::quiet_NaN()
+ R(0.0 / 0.0), // FP::quiet_NaN()
#endif
- +int_min, +int_max, +uint_min, +uint_max,
- -int_min, -int_max, -uint_min, -uint_max,
- +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1),
- -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1),
- +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1),
- -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1),
- +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0),
- -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0),
- +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0),
- -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0),
- +mantissa_max, +mantissa_max-R(1.0), +mantissa_max+R(1.0),
- -mantissa_max, -mantissa_max-R(1.0), -mantissa_max+R(1.0),
- +real_max, +real_max-R(1.0), +real_max+R(1.0),
- -real_max, -real_max-R(1.0), -real_max+R(1.0),
- -R(443.9999425),
+ +int_min,
+ +int_max,
+ +uint_min,
+ +uint_max,
+ -int_min,
+ -int_max,
+ -uint_min,
+ -uint_max,
+ +int_min + R(0.1),
+ +int_max + R(0.1),
+ +uint_min + R(0.1),
+ +uint_max + R(0.1),
+ -int_min + R(0.1),
+ -int_max + R(0.1),
+ -uint_min + R(0.1),
+ -uint_max + R(0.1),
+ +int_min - R(0.1),
+ +int_max - R(0.1),
+ +uint_min - R(0.1),
+ +uint_max - R(0.1),
+ -int_min - R(0.1),
+ -int_max - R(0.1),
+ -uint_min - R(0.1),
+ -uint_max - R(0.1),
+ +int_min + R(1.0),
+ +int_max + R(1.0),
+ +uint_min + R(1.0),
+ +uint_max + R(1.0),
+ -int_min + R(1.0),
+ -int_max + R(1.0),
+ -uint_min + R(1.0),
+ -uint_max + R(1.0),
+ +int_min - R(1.0),
+ +int_max - R(1.0),
+ +uint_min - R(1.0),
+ +uint_max - R(1.0),
+ -int_min - R(1.0),
+ -int_max - R(1.0),
+ -uint_min - R(1.0),
+ -uint_max - R(1.0),
+ +mantissa_max,
+ +mantissa_max - R(1.0),
+ +mantissa_max + R(1.0),
+ -mantissa_max,
+ -mantissa_max - R(1.0),
+ -mantissa_max + R(1.0),
+ +real_max,
+ +real_max - R(1.0),
+ +real_max + R(1.0),
+ -real_max,
+ -real_max - R(1.0),
+ -real_max + R(1.0),
+ -R(443.9999425),
};
const int nvalues = sizeof values / sizeof *values;
-
- for (int i=0; i<nvalues+imax; ++i) {
+
+ for (int i = 0; i < nvalues + imax; ++i) {
const realvec_t x =
- i<nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10));
+ i < nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10));
const intvec_t n1 = random(int_t(-100), int_t(+100));
- //const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000));
+ // const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000));
const intvec_t n2 =
- random(std::numeric_limits<int_t>::min() / 2, // avoid overflow
- std::numeric_limits<int_t>::max() / 2);
+ random(std::numeric_limits<int_t>::min() / 2, // avoid overflow
+ std::numeric_limits<int_t>::max() / 2);
const realvec_t fn1 = vecmathlib::convert_float(n1);
const realvec_t fn2 = vecmathlib::convert_float(n2);
const realvec_t fn1h = vecmathlib::convert_float(n1) * RV(0.25);
const realvec_t fn2h = vecmathlib::convert_float(n2) * RV(0.25);
- check_real<IV>("convert_float",
- FP::convert_float, vecmathlib::convert_float, n1, R(0.0));
- check_real<IV>("convert_float",
- FP::convert_float, vecmathlib::convert_float, n2, R(0.0));
+ check_real<IV>("convert_float", FP::convert_float,
+ vecmathlib::convert_float, n1, R(0.0));
+ check_real<IV>("convert_float", FP::convert_float,
+ vecmathlib::convert_float, n2, R(0.0));
// Note: RV(int_max) > int_max due to rounding
if (all(x >= RV(int_min) && x < RV(int_max))) {
- check_int<RV>("convert_int",
- FP::convert_int, vecmathlib::convert_int, x);
+ check_int<RV>("convert_int", FP::convert_int, vecmathlib::convert_int,
+ x);
}
// TODO: These should all have accuracy R(0.0) instead!
check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, x, accuracy());
@@ -1387,218 +1409,213 @@ struct vecmathlib_test {
check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn1h, accuracy());
check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn2h, accuracy());
check_real<RV>("floor", vml_std::floor, vecmathlib::floor, x, accuracy());
- check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1, accuracy());
- check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2, accuracy());
- check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h, accuracy());
- check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h, accuracy());
- // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x, accuracy());
- // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1, accuracy());
- // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2, accuracy());
- // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h, accuracy());
- // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h, accuracy());
+ check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1,
+ accuracy());
+ check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2,
+ accuracy());
+ check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h,
+ accuracy());
+ check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h,
+ accuracy());
+ // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x,
+ // accuracy());
+ // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1,
+ // accuracy());
+ // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2,
+ // accuracy());
+ // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h,
+ // accuracy());
+ // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h,
+ // accuracy());
check_real<RV>("rint", vml_std::rint, vecmathlib::rint, x, accuracy());
check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1, accuracy());
check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2, accuracy());
check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1h, accuracy());
check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2h, accuracy());
check_real<RV>("round", vml_std::round, vecmathlib::round, x, accuracy());
- check_real<RV>("round", vml_std::round, vecmathlib::round, fn1, accuracy());
- check_real<RV>("round", vml_std::round, vecmathlib::round, fn2, accuracy());
- check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h, accuracy());
- check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h, accuracy());
+ check_real<RV>("round", vml_std::round, vecmathlib::round, fn1,
+ accuracy());
+ check_real<RV>("round", vml_std::round, vecmathlib::round, fn2,
+ accuracy());
+ check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h,
+ accuracy());
+ check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h,
+ accuracy());
check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, x, accuracy());
- check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1, accuracy());
- check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2, accuracy());
- check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h, accuracy());
- check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h, accuracy());
+ check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1,
+ accuracy());
+ check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2,
+ accuracy());
+ check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h,
+ accuracy());
+ check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h,
+ accuracy());
}
}
-
-
-
- static void test_asin()
- {
+
+ static void test_asin() {
cout << " testing asin acos atan atan2...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-1.0), R(+1.0));
check_real<RV>("asin", vml_std::asin, vecmathlib::asin, x, accuracy(4));
check_real<RV>("acos", vml_std::acos, vecmathlib::acos, x, accuracy(4));
}
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-100.0), R(+100.0));
const realvec_t y = random(R(-100.0), R(+100.0));
check_real<RV>("atan", vml_std::atan, vecmathlib::atan, x, accuracy(5));
- check_real<RV,RV>("atan2",
- vml_std::atan2, vecmathlib::atan2, x, y, accuracy(6));
+ check_real<RV, RV>("atan2", vml_std::atan2, vecmathlib::atan2, x, y,
+ accuracy(6));
}
}
-
- static void test_asinh()
- {
+
+ static void test_asinh() {
cout << " testing asinh acosh atanh...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-1000.0), R(+1000.0));
- check_real<RV>("asinh",
- vml_std::asinh, vecmathlib::asinh, x, accuracy(4));
+ check_real<RV>("asinh", vml_std::asinh, vecmathlib::asinh, x,
+ accuracy(4));
}
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(1.0), R(1000.0));
- check_real<RV>("acosh",
- vml_std::acosh, vecmathlib::acosh, x, accuracy(4));
+ check_real<RV>("acosh", vml_std::acosh, vecmathlib::acosh, x,
+ accuracy(4));
}
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-1.0), R(+1.0));
- check_real<RV>("atanh",
- vml_std::atanh, vecmathlib::atanh, x, accuracy(5));
+ check_real<RV>("atanh", vml_std::atanh, vecmathlib::atanh, x,
+ accuracy(5));
}
}
-
+
static real_t local_exp10(real_t x) { return pow(R(10.0), x); }
- static void test_exp()
- {
+ static void test_exp() {
cout << " testing exp exp10 exp2 expm1...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-100.0), R(+100.0));
check_real<RV>("exp", vml_std::exp, vecmathlib::exp, x, accuracy(3));
check_real<RV>("exp10", local_exp10, vecmathlib::exp10, x, accuracy(3));
check_real<RV>("exp2", vml_std::exp2, vecmathlib::exp2, x, accuracy(3));
- check_real<RV>("expm1",
- vml_std::expm1, vecmathlib::expm1, x, accuracy(3));
+ check_real<RV>("expm1", vml_std::expm1, vecmathlib::expm1, x,
+ accuracy(3));
}
}
-
- static void test_log()
- {
+
+ static void test_log() {
cout << " testing log log10 log1p log2...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(1.0e-10), R(1.0e+10));
check_real<RV>("log", vml_std::log, vecmathlib::log, x, accuracy(3));
- check_real<RV>("log10",
- vml_std::log10, vecmathlib::log10, x, accuracy(3));
- check_real<RV>("log1p",
- vml_std::log1p, vecmathlib::log1p, x, accuracy(2));
+ check_real<RV>("log10", vml_std::log10, vecmathlib::log10, x,
+ accuracy(3));
+ check_real<RV>("log1p", vml_std::log1p, vecmathlib::log1p, x,
+ accuracy(2));
check_real<RV>("log2", vml_std::log2, vecmathlib::log2, x, accuracy(3));
}
}
-
- static void test_pow()
- {
+
+ static void test_pow() {
cout << " testing pow...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(0.001), R(1000.0));
const realvec_t y = random(R(-10.0), R(+10.0));
const realvec_t ya = fabs(y);
const intvec_t n = random(I(-10), I(+10));
const realvec_t fn = vecmathlib::convert_float(n);
- check_real<RV,RV>("pow(0,y)",
- vml_std::pow, vecmathlib::pow, RV(0.0), ya,
- accuracy(16));
- check_real<RV,RV>("pow(x,0)",
- vml_std::pow, vecmathlib::pow, x, RV(0.0),
- accuracy(16));
+ check_real<RV, RV>("pow(0,y)", vml_std::pow, vecmathlib::pow, RV(0.0), ya,
+ accuracy(16));
+ check_real<RV, RV>("pow(x,0)", vml_std::pow, vecmathlib::pow, x, RV(0.0),
+ accuracy(16));
// just to check
check_real<RV>("log(x)", vml_std::log, vecmathlib::log, x, accuracy(3));
- check_real<RV,RV>("pow(x,y)",
- vml_std::pow, vecmathlib::pow, x, y, accuracy(16));
- check_real<RV,RV>("pow(-x,n)",
- vml_std::pow, vecmathlib::pow, -x, fn, accuracy(16));
+ check_real<RV, RV>("pow(x,y)", vml_std::pow, vecmathlib::pow, x, y,
+ accuracy(16));
+ check_real<RV, RV>("pow(-x,n)", vml_std::pow, vecmathlib::pow, -x, fn,
+ accuracy(16));
}
}
-
- static real_t local_rcp(real_t x) { return R(1.0)/x; }
- static void test_rcp()
- {
+
+ static real_t local_rcp(real_t x) { return R(1.0) / x; }
+ static void test_rcp() {
cout << " testing / fmod rcp remainder...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-100.0), R(+100.0));
const realvec_t y = random(R(-100.0), R(+100.0));
const intvec_t n = random(I(-100), I(+100));
const intvec_t m = random(I(-100), I(+100));
const realvec_t fn = vecmathlib::convert_float(n);
- const realvec_t fm = vecmathlib::convert_float
- (m + vecmathlib::convert_int(m == intvec_t(I(0))));
- check_real<RV,RV>("/", local_div, local_div, x, y, accuracy());
+ const realvec_t fm = vecmathlib::convert_float(
+ m + vecmathlib::convert_int(m == intvec_t(I(0))));
+ check_real<RV, RV>("/", local_div, local_div, x, y, accuracy());
check_real<RV>("rcp", local_rcp, vecmathlib::rcp, x, accuracy());
- check_real<RV,RV>("fmod(x,y)",
- vml_std::fmod, vecmathlib::fmod, x, y,
- 2.0*accuracy(), y);
- check_real<RV,RV>("fmod(x,m)",
- vml_std::fmod, vecmathlib::fmod, x, fm,
- 2.0*accuracy(), fm);
- check_real<RV,RV>("fmod(n,y)",
- vml_std::fmod, vecmathlib::fmod, fn, y,
- 2.0*accuracy(), y);
- check_real<RV,RV>("remainder(x,y)",
- vml_std::remainder, vecmathlib::remainder,
- x, y, R(2.0)*accuracy(), y);
- check_real<RV,RV>("remainder(x,m)",
- vml_std::remainder, vecmathlib::remainder,
- x, fm, R(2.0)*accuracy(), fm);
- check_real<RV,RV>("remainder(n,y)",
- vml_std::remainder, vecmathlib::remainder,
- fn, y, R(2.0)*accuracy(), y);
+ check_real<RV, RV>("fmod(x,y)", vml_std::fmod, vecmathlib::fmod, x, y,
+ 2.0 * accuracy(), y);
+ check_real<RV, RV>("fmod(x,m)", vml_std::fmod, vecmathlib::fmod, x, fm,
+ 2.0 * accuracy(), fm);
+ check_real<RV, RV>("fmod(n,y)", vml_std::fmod, vecmathlib::fmod, fn, y,
+ 2.0 * accuracy(), y);
+ check_real<RV, RV>("remainder(x,y)", vml_std::remainder,
+ vecmathlib::remainder, x, y, R(2.0) * accuracy(), y);
+ check_real<RV, RV>("remainder(x,m)", vml_std::remainder,
+ vecmathlib::remainder, x, fm, R(2.0) * accuracy(), fm);
+ check_real<RV, RV>("remainder(n,y)", vml_std::remainder,
+ vecmathlib::remainder, fn, y, R(2.0) * accuracy(), y);
}
}
-
- static void test_sin()
- {
+
+ static void test_sin() {
cout << " testing cos sin tan...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-10.0), R(+10.0));
check_real<RV>("sin", vml_std::sin, vecmathlib::sin, x, accuracy(4));
check_real<RV>("cos", vml_std::cos, vecmathlib::cos, x, accuracy(4));
}
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x0 = random(R(-1.55), R(+1.55));
const intvec_t n = random(I(-10), I(+10));
const realvec_t x = x0 + vecmathlib::convert_float(n) * RV(M_PI);
// tan loses accuracy near pi/2
// (by definition, not by implementation?)
- check_real<RV>("tan",
- vml_std::tan, vecmathlib::tan, x, R(20.0)*accuracy(5));
+ check_real<RV>("tan", vml_std::tan, vecmathlib::tan, x,
+ R(20.0) * accuracy(5));
}
}
-
- static void test_sinh()
- {
+
+ static void test_sinh() {
cout << " testing cosh sinh tanh...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(-10.0), R(+10.0));
check_real<RV>("sinh", vml_std::sinh, vecmathlib::sinh, x, accuracy(4));
check_real<RV>("cosh", vml_std::cosh, vecmathlib::cosh, x, accuracy(4));
check_real<RV>("tanh", vml_std::tanh, vecmathlib::tanh, x, accuracy(5));
}
}
-
- static real_t local_rsqrt(real_t x) { return R(1.0)/sqrt(x); }
- static void test_sqrt()
- {
+
+ static real_t local_rsqrt(real_t x) { return R(1.0) / sqrt(x); }
+ static void test_sqrt() {
cout << " testing cbrt hypot rsqrt sqrt...\n" << flush;
- for (int i=0; i<imax; ++i) {
+ for (int i = 0; i < imax; ++i) {
const realvec_t x = random(R(1.0e-3), R(1.0e+3));
const realvec_t y = random(-R(1.0e+3), R(1.0e+3));
const realvec_t z = random(-R(1.0e+3), R(1.0e+3));
check_real<RV>("cbrt", vml_std::cbrt, vecmathlib::cbrt, x, accuracy());
- check_real<RV,RV>("hypot",
- vml_std::hypot, vecmathlib::hypot, y, z, accuracy());
+ check_real<RV, RV>("hypot", vml_std::hypot, vecmathlib::hypot, y, z,
+ accuracy());
check_real<RV>("rsqrt", local_rsqrt, vecmathlib::rsqrt, x, accuracy());
check_real<RV>("sqrt", vml_std::sqrt, vecmathlib::sqrt, x, accuracy());
}
}
-
-
-
- static void test()
- {
+
+ static void test() {
cout << "\n"
<< "Testing math functions for type " << realvec_t::name() << ":\n";
-
+
test_bool();
test_int();
test_real();
-
+
test_mem();
-
+
// Test "basic" functions first
test_abs();
test_fabs();
@@ -1615,90 +1632,86 @@ struct vecmathlib_test {
}
};
-
-
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
using namespace vecmathlib;
cout << "Testing math functions:\n"
- << "[" VECMATHLIB_CONFIGURATION "]\n"
- << flush;
-
- vecmathlib_test<realpseudovec<float,1> >::test();
+ << "[" VECMATHLIB_CONFIGURATION "]\n" << flush;
+
+ vecmathlib_test<realpseudovec<float, 1>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<float,1> >::test();
+ vecmathlib_test<realbuiltinvec<float, 1>>::test();
#endif
- vecmathlib_test<realtestvec<float,1> >::test();
+ vecmathlib_test<realtestvec<float, 1>>::test();
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_1
- vecmathlib_test<realvec<float,1> >::test();
+ vecmathlib_test<realvec<float, 1>>::test();
#endif
- vecmathlib_test<realpseudovec<float,2> >::test();
+ vecmathlib_test<realpseudovec<float, 2>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<float,2> >::test();
+ vecmathlib_test<realbuiltinvec<float, 2>>::test();
#endif
- vecmathlib_test<realtestvec<float,2> >::test();
+ vecmathlib_test<realtestvec<float, 2>>::test();
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_2
- vecmathlib_test<realvec<float,2> >::test();
+ vecmathlib_test<realvec<float, 2>>::test();
#endif
- vecmathlib_test<realpseudovec<float,4> >::test();
+ vecmathlib_test<realpseudovec<float, 4>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<float,4> >::test();
+ vecmathlib_test<realbuiltinvec<float, 4>>::test();
#endif
- vecmathlib_test<realtestvec<float,4> >::test();
+ vecmathlib_test<realtestvec<float, 4>>::test();
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_4
- vecmathlib_test<realvec<float,4> >::test();
+ vecmathlib_test<realvec<float, 4>>::test();
#endif
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_8
- vecmathlib_test<realpseudovec<float,8> >::test();
+ vecmathlib_test<realpseudovec<float, 8>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<float,8> >::test();
+ vecmathlib_test<realbuiltinvec<float, 8>>::test();
#endif
- vecmathlib_test<realtestvec<float,8> >::test();
- vecmathlib_test<realvec<float,8> >::test();
+ vecmathlib_test<realtestvec<float, 8>>::test();
+ vecmathlib_test<realvec<float, 8>>::test();
#endif
#ifdef VECMATHLIB_HAVE_VEC_FLOAT_16
- vecmathlib_test<realpseudovec<float,16> >::test();
+ vecmathlib_test<realpseudovec<float, 16>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<float,16> >::test();
+ vecmathlib_test<realbuiltinvec<float, 16>>::test();
#endif
- vecmathlib_test<realtestvec<float,16> >::test();
- vecmathlib_test<realvec<float,16> >::test();
+ vecmathlib_test<realtestvec<float, 16>>::test();
+ vecmathlib_test<realvec<float, 16>>::test();
#endif
-
- vecmathlib_test<realpseudovec<double,1> >::test();
+
+ vecmathlib_test<realpseudovec<double, 1>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<double,1> >::test();
+ vecmathlib_test<realbuiltinvec<double, 1>>::test();
#endif
- vecmathlib_test<realtestvec<double,1> >::test();
+ vecmathlib_test<realtestvec<double, 1>>::test();
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1
- vecmathlib_test<realvec<double,1> >::test();
+ vecmathlib_test<realvec<double, 1>>::test();
#endif
- vecmathlib_test<realpseudovec<double,2> >::test();
+ vecmathlib_test<realpseudovec<double, 2>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<double,2> >::test();
+ vecmathlib_test<realbuiltinvec<double, 2>>::test();
#endif
- vecmathlib_test<realtestvec<double,2> >::test();
+ vecmathlib_test<realtestvec<double, 2>>::test();
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2
- vecmathlib_test<realvec<double,2> >::test();
+ vecmathlib_test<realvec<double, 2>>::test();
#endif
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4
- vecmathlib_test<realpseudovec<double,4> >::test();
+ vecmathlib_test<realpseudovec<double, 4>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<double,4> >::test();
+ vecmathlib_test<realbuiltinvec<double, 4>>::test();
#endif
- vecmathlib_test<realtestvec<double,4> >::test();
- vecmathlib_test<realvec<double,4> >::test();
+ vecmathlib_test<realtestvec<double, 4>>::test();
+ vecmathlib_test<realvec<double, 4>>::test();
#endif
#ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8
- vecmathlib_test<realpseudovec<double,8> >::test();
+ vecmathlib_test<realpseudovec<double, 8>>::test();
#ifdef __clang__
- vecmathlib_test<realbuiltinvec<double,8> >::test();
+ vecmathlib_test<realbuiltinvec<double, 8>>::test();
#endif
- vecmathlib_test<realtestvec<double,8> >::test();
- vecmathlib_test<realvec<double,8> >::test();
+ vecmathlib_test<realtestvec<double, 8>>::test();
+ vecmathlib_test<realvec<double, 8>>::test();
#endif
-
+
cout << "\n";
if (num_errors == 0) {
cout << "SUCCESS";
@@ -1706,6 +1719,6 @@ int main(int argc, char** argv)
cout << "FAILURE";
}
cout << ": " << num_errors << " errors found\n" << flush;
-
+
return num_errors == 0 ? 0 : 1;
}
diff --git a/vec_altivec_float4.h b/vec_altivec_float4.h
index 14e0308..55530b4 100644
--- a/vec_altivec_float4.h
+++ b/vec_altivec_float4.h
@@ -13,647 +13,566 @@
#include <altivec.h>
#if defined __clang__
-# define __vector vector
-# define __pixel pixel
-# define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
#elif defined __gcc__
-# undef vector
-# undef pixel
-# undef bool
+#undef vector
+#undef pixel
+#undef bool
#elif defined __xlC__
-# define __bool bool
+#define __bool bool
#else
-# error "Unknown compiler"
+#error "Unknown compiler"
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_4
- template<> struct boolvec<float,4>;
- template<> struct intvec<float,4>;
- template<> struct realvec<float,4>;
-
-
-
- template<>
- struct boolvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef bool scalar_t;
- typedef __vector __bool int bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values are -1, false values are 0
- static uint_t from_bool(bool a) { return -int_t(a); }
- static bool to_bool(uint_t a) { return a; }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v((bvector_t)vec_splats(from_bool(a))) {}
- boolvec(bool const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return vec_nor(v, v); }
-
- boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
- boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
- // boolvec operator==(boolvec x) const { return !(*this!=x); }
- boolvec operator==(boolvec x) const; // defined after intvec
- boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-
- bool all() const { return vec_all_ne(v, BV(false).v); }
- bool any() const { return vec_any_ne(v, BV(false).v); }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef int_t scalar_t;
- typedef __vector signed int ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(vec_splats(a)) {}
- intvec(int_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
- static intvec iota() { return (__vector signed int){0, 1, 2, 3}; }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- // Vector casts do not change the bit battern
- boolvec_t as_bool() const { return (__vector __bool int)v; }
- boolvec_t convert_bool() const { return *this != IV(0); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec operator+() const { return *this; }
- intvec operator-() const
- {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef bool scalar_t;
+ typedef __vector __bool int bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values are -1, false values are 0
+ static uint_t from_bool(bool a) { return -int_t(a); }
+ static bool to_bool(uint_t a) { return a; }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v((bvector_t)vec_splats(from_bool(a))) {}
+ boolvec(bool const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return vec_nor(v, v); }
+
+ boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+ boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+ // boolvec operator==(boolvec x) const { return !(*this!=x); }
+ boolvec operator==(boolvec x) const; // defined after intvec
+ boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+ bool all() const { return vec_all_ne(v, BV(false).v); }
+ bool any() const { return vec_any_ne(v, BV(false).v); }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef int_t scalar_t;
+ typedef __vector signed int ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(vec_splats(a)) {}
+ intvec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+ static intvec iota() { return (__vector signed int){0, 1, 2, 3}; }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+  // Vector casts do not change the bit pattern
+ boolvec_t as_bool() const { return (__vector __bool int)v; }
+ boolvec_t convert_bool() const { return *this != IV(0); }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const {
#if defined __xlC_
- return vec_neg(v);
+ return vec_neg(v);
#else
- // vec_neg does not exist in clang
- return IV(I(0)) - *this;
+ // vec_neg does not exist in clang
+ return IV(I(0)) - *this;
#endif
+ }
+
+ // Lane-wise addition and subtraction (vec_add / vec_sub).
+ intvec operator+(intvec x) const { return vec_add(v, x.v); }
+ intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+
+ // The compound assignments are expressed through the binary operators.
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+ // Bitwise NOT, computed as the NOR of the value with itself.
+ intvec operator~() const { return vec_nor(v, v); }
+
+ // Bitwise AND, OR and XOR.
+ intvec operator&(intvec x) const { return vec_and(v, x.v); }
+ intvec operator|(intvec x) const { return vec_or(v, x.v); }
+ intvec operator^(intvec x) const { return vec_xor(v, x.v); }
+
+ // The compound assignments are expressed through the binary operators.
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ // Shifts by a scalar count. lsr, >> and << broadcast the count with IV(n)
+ // and forward to the per-lane overloads; rotate(int_t) is only declared
+ // here.
+ intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+ intvec_t rotate(int_t n) const;
+ intvec operator>>(int_t n) const { return *this >> IV(n); }
+ intvec operator<<(int_t n) const { return *this << IV(n); }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ // Shifts with a separate count for every lane. The counts are cast to
+ // __vector unsigned int before being passed to the intrinsics.
+ // lsr uses vec_sr (logical shift right, per the AltiVec manual).
+ intvec_t lsr(intvec_t n) const {
+ return vec_sr(v, (__vector unsigned int)n.v);
+ }
+ intvec_t rotate(intvec_t n) const;
+ // operator>> uses vec_sra (algebraic, i.e. sign-propagating, shift right).
+ intvec operator>>(intvec n) const {
+ return vec_sra(v, (__vector unsigned int)n.v);
+ }
+ // operator<< uses vec_sl (shift left).
+ intvec operator<<(intvec n) const {
+ return vec_sl(v, (__vector unsigned int)n.v);
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ // Lane-wise comparisons returning a boolean vector. ==, < and > call the
+ // matching vec_cmp* intrinsic; !=, <= and >= are formed by negating
+ // ==, > and < respectively.
+ boolvec_t operator==(intvec const &x) const { return vec_cmpeq(v, x.v); }
+ boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(intvec const &x) const { return vec_cmplt(v, x.v); }
+ boolvec_t operator<=(intvec const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec const &x) const { return vec_cmpgt(v, x.v); }
+ boolvec_t operator>=(intvec const &x) const { return !(*this < x); }
+
+ // Lane-wise absolute value.
+ intvec_t abs() const { return vec_abs(v); }
+ // Shifts every lane right by bits - 1 using operator>> and reinterprets
+ // the result as a boolean vector.
+ // NOTE(review): presumably `bits` (from floatprops) is the lane width, so
+ // the sign bit is smeared across the whole lane -- confirm.
+ boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); }
+ // Lane-wise maximum and minimum.
+ intvec_t max(intvec_t x) const { return vec_max(v, x.v); }
+ intvec_t min(intvec_t x) const { return vec_min(v, x.v); }
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef real_t scalar_t;
+ typedef __vector float vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<Altivec:4*float>"; }
+ void barrier() { __asm__("" : "+v"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(vec_splats(a)) {}
+ realvec(real_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return vec_ld(0, p);
+ }
+ // Load four floats from an address that need not be vector-aligned.
+ // Two vec_ld loads at byte offsets 0 and 15 are combined with vec_perm,
+ // using the permute control vector that vec_lvsl derives from p.
+ // Per the AltiVec manual vec_ld ignores the low four address bits, so the
+ // two loads fetch the aligned 16-byte blocks holding the first and the
+ // last byte of the requested data.
+ static realvec_t loadu(real_t const *p) {
+ realvec_t v0 = vec_ld(0, p);
+ realvec_t v1 = vec_ld(15, p);
+ return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
+ }
+ // Load from p + ioff, where the base pointer p itself must be aligned.
+ // When ioff is a multiple of the vector size the aligned load is used,
+ // otherwise the unaligned one.
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t const *const src = p + ioff;
+   bool const offset_is_whole_vectors = ioff % realvec::size == 0;
+   return offset_is_whole_vectors ? loada(src) : loadu(src);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
-
- intvec operator+(intvec x) const { return vec_add(v, x.v); }
- intvec operator-(intvec x) const { return vec_sub(v, x.v); }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
-
-
-
- intvec operator~() const { return vec_nor(v, v); }
-
- intvec operator&(intvec x) const { return vec_and(v, x.v); }
- intvec operator|(intvec x) const { return vec_or(v, x.v); }
- intvec operator^(intvec x) const { return vec_xor(v, x.v); }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const { return lsr(IV(n)); }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const { return *this >> IV(n); }
- intvec operator<<(int_t n) const { return *this << IV(n); }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- return vec_sr(v, (__vector unsigned int)n.v);
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- return vec_sra(v, (__vector unsigned int)n.v);
- }
- intvec operator<<(intvec n) const
- {
- return vec_sl(v, (__vector unsigned int)n.v);
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); }
- boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
- boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); }
- boolvec_t operator<=(intvec const& x) const { return !(*this > x); }
- boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); }
- boolvec_t operator>=(intvec const& x) const { return !(*this < x); }
-
- intvec_t abs() const { return vec_abs(v); }
- boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
- intvec_t max(intvec_t x) const { return vec_max(v, x.v); }
- intvec_t min(intvec_t x) const { return vec_min(v, x.v); }
- };
-
-
-
- template<>
- struct realvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef real_t scalar_t;
- typedef __vector float vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<Altivec:4*float>"; }
- void barrier() { __asm__("": "+v"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(vec_splats(a)) {}
- realvec(real_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return vec_ld(0, p);
- }
- static realvec_t loadu(real_t const* p)
- {
- realvec_t v0 = vec_ld(0, p);
- realvec_t v1 = vec_ld(15, p);
- return vec_perm(v0.v, v1.v, vec_lvsl(0, p));
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- vec_st(v, 0, p);
- }
- void storeu(real_t* p) const
- {
- // Vector stores would require vector loads, which would need to
- // be atomic
- // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
- p[0] = (*this)[0];
- p[1] = (*this)[1];
- p[2] = (*this)[2];
- p[3] = (*this)[3];
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- // Use vec_ste?
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- // Use vec_ste?
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ vec_st(v, 0, p);
+ }
+ // Store the four lanes to an address that need not be vector-aligned.
+ void storeu(real_t *p) const {
+   // Written element by element: vector stores would require vector
+   // loads, which would need to be atomic
+   // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+   // for good ideas
+   for (int lane = 0; lane < size; ++lane)
+     p[lane] = (*this)[lane];
+ }
+ // Store to p + ioff, where the base pointer p itself must be aligned.
+ // An offset that is a multiple of the vector size takes the aligned
+ // store; any other offset takes the unaligned one.
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t *const dst = p + ioff;
+   if (ioff % realvec::size != 0) {
+     storeu(dst);
+   } else {
+     storea(dst);
+   }
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // Use vec_ste?
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // Use vec_ste?
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
-
-
-
- intvec_t as_int() const { return (__vector signed int) v; }
- intvec_t convert_int() const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return (__vector signed int)v; }
+ intvec_t convert_int() const {
#if defined __xlC__
- return vec_cts(v, 0);
+ return vec_cts(v, 0);
#else
- // vec_cts leads to an ICE in clang
- return MF::vml_convert_int(*this);
+ // vec_cts leads to an ICE in clang
+ return MF::vml_convert_int(*this);
#endif
- }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const
- {
+ }
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const {
#if defined __xlC_
- return vec_neg(v);
+ return vec_neg(v);
#else
- // vec_neg does not exist in clang
- return RV(0.0) - *this;
+ // vec_neg does not exist in clang
+ return RV(0.0) - *this;
#endif
- }
-
- realvec operator+(realvec x) const { return vec_add(v, x.v); }
- realvec operator-(realvec x) const { return vec_sub(v, x.v); }
- realvec operator*(realvec x) const {
+ }
+
+ realvec operator+(realvec x) const { return vec_add(v, x.v); }
+ realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+ realvec operator*(realvec x) const {
#if defined __xlC__
- return vec_mul(v, x.v);
+ return vec_mul(v, x.v);
#else
- // vec_mul does not exist in clang
- return vec_madd(v, x.v, RV(0.0).v);
+ // vec_mul does not exist in clang
+ return vec_madd(v, x.v, RV(0.0).v);
#endif
- }
- realvec operator/(realvec x) const {
+ }
+ realvec operator/(realvec x) const {
#if defined __xlC__
- return vec_div(v, x.v);
+ return vec_div(v, x.v);
#else
- // vec_div does not exist in clang
- return *this * x.rcp();
+ // vec_div does not exist in clang
+ return *this * x.rcp();
#endif
- }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
- vml_std::fmax((*this)[2], (*this)[3]));
- }
- real_t minval() const
- {
- return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
- vml_std::fmin((*this)[2], (*this)[3]));
- }
- real_t prod() const
- {
- return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
- }
- real_t sum() const
- {
- return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
- }
-
-
-
- boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
- boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
- boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
- boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
- boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
- boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-
-
-
- realvec acos() const { return MF::vml_acos(*this); }
- realvec acosh() const { return MF::vml_acosh(*this); }
- realvec asin() const { return MF::vml_asin(*this); }
- realvec asinh() const { return MF::vml_asinh(*this); }
- realvec atan() const { return MF::vml_atan(*this); }
- realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
- realvec atanh() const { return MF::vml_atanh(*this); }
- realvec cbrt() const { return MF::vml_cbrt(*this); }
- realvec ceil() const { return vec_ceil(v); }
- realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
- realvec cos() const { return MF::vml_cos(*this); }
- realvec cosh() const { return MF::vml_cosh(*this); }
- realvec exp() const { return MF::vml_exp(*this); }
- realvec exp10() const { return MF::vml_exp10(*this); }
- realvec exp2() const { return MF::vml_exp2(*this); }
- realvec expm1() const { return MF::vml_expm1(*this); }
- realvec fabs() const { return vec_abs(v); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const { return vec_floor(v); }
- realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
- realvec fmax(realvec y) const { return vec_max(v, y.v); }
- realvec fmin(realvec y) const { return vec_min(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec log() const { return MF::vml_log(*this); }
- realvec log10() const { return MF::vml_log10(*this); }
- realvec log1p() const { return MF::vml_log1p(*this); }
- realvec log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return vec_madd(v, y.v, z.v);
- }
- realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
- realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
- realvec rcp() const
- {
- realvec x = *this;
- realvec r = vec_re(v); // this is only an approximation
- // TODO: use fma
- // Note: don't rewrite this expression, this may introduce
- // cancellation errors
- r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp)
- return r;
- }
- realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
- realvec rint() const { return vec_round(v); /* sic! */ }
- realvec round() const { return MF::vml_round(*this); }
- realvec rsqrt() const
- {
- realvec x = *this;
- realvec r = vec_rsqrte(x.v); // this is only an approximation
- // TODO: use fma
- // one Newton iteration (see vml_rsqrt)
- r += RV(0.5)*r * (RV(1.0) - x * r*r);
- return r;
- }
- boolvec_t signbit() const { return MF::vml_signbit(*this); }
- realvec sin() const { return MF::vml_sin(*this); }
- realvec sinh() const { return MF::vml_sinh(*this); }
- realvec sqrt() const {
+ }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; }
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ // Largest of the four lanes, reduced pairwise with vml_std::fmax.
+ real_t maxval() const {
+   real_t const first_pair = vml_std::fmax((*this)[0], (*this)[1]);
+   real_t const second_pair = vml_std::fmax((*this)[2], (*this)[3]);
+   return vml_std::fmax(first_pair, second_pair);
+ }
+ // Smallest of the four lanes, reduced pairwise with vml_std::fmin.
+ real_t minval() const {
+   real_t const first_pair = vml_std::fmin((*this)[0], (*this)[1]);
+   real_t const second_pair = vml_std::fmin((*this)[2], (*this)[3]);
+   return vml_std::fmin(first_pair, second_pair);
+ }
+ // Product of the four lanes, accumulated from lane 0 upwards.
+ real_t prod() const {
+   real_t acc = (*this)[0];
+   for (int lane = 1; lane < size; ++lane)
+     acc *= (*this)[lane];
+   return acc;
+ }
+ // Sum of the four lanes, accumulated from lane 0 upwards.
+ real_t sum() const {
+   real_t acc = (*this)[0];
+   for (int lane = 1; lane < size; ++lane)
+     acc += (*this)[lane];
+   return acc;
+ }
+
+ // Lane-wise comparisons returning a boolean vector. Every operator except
+ // != calls the corresponding vec_cmp* intrinsic directly; != is the
+ // negation of ==.
+ boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); }
+ boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+ boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); }
+ boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+ boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+ // Elementary math functions. Most of them forward to the generic
+ // implementations in MF (mathfuncs<realvec_t>); ceil, fabs, floor, fma,
+ // fmax, fmin and mad call Altivec intrinsics directly.
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec acosh() const { return MF::vml_acosh(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec asinh() const { return MF::vml_asinh(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+ realvec atanh() const { return MF::vml_atanh(*this); }
+ realvec cbrt() const { return MF::vml_cbrt(*this); }
+ realvec ceil() const { return vec_ceil(v); }
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec cos() const { return MF::vml_cos(*this); }
+ realvec cosh() const { return MF::vml_cosh(*this); }
+ realvec exp() const { return MF::vml_exp(*this); }
+ realvec exp10() const { return MF::vml_exp10(*this); }
+ realvec exp2() const { return MF::vml_exp2(*this); }
+ realvec expm1() const { return MF::vml_expm1(*this); }
+ realvec fabs() const { return vec_abs(v); }
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const { return vec_floor(v); }
+ // (*this) * y + z via vec_madd.
+ realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+ realvec fmax(realvec y) const { return vec_max(v, y.v); }
+ realvec fmin(realvec y) const { return vec_min(v, y.v); }
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ // The exponent part is written through the pointer r.
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ // Issues the same vec_madd call as fma.
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return vec_madd(v, y.v, z.v);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+ // Lane-wise reciprocal 1/x. vec_re only returns an estimate, which is
+ // refined here with one Newton step: r <- r + r * (1 - x * r).
+ realvec rcp() const {
+ realvec x = *this;
+ realvec r = vec_re(v); // this is only an approximation
+ // TODO: use fma
+ // Note: don't rewrite this expression, this may introduce
+ // cancellation errors
+ r += r * (RV(1.0) - x * r); // one Newton iteration (see vml_rcp)
+ return r;
+ }
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const { return vec_round(v); /* sic! */ }
+ realvec round() const { return MF::vml_round(*this); }
+ // Lane-wise reciprocal square root 1/sqrt(x). vec_rsqrte only returns an
+ // estimate, which is refined here with one Newton step:
+ // r <- r + 0.5 * r * (1 - x * r * r).
+ realvec rsqrt() const {
+ realvec x = *this;
+ realvec r = vec_rsqrte(x.v); // this is only an approximation
+ // TODO: use fma
+ // one Newton iteration (see vml_rsqrt)
+ r += RV(0.5) * r * (RV(1.0) - x * r * r);
+ return r;
+ }
+ boolvec_t signbit() const { return MF::vml_signbit(*this); }
+ realvec sin() const { return MF::vml_sin(*this); }
+ realvec sinh() const { return MF::vml_sinh(*this); }
+ realvec sqrt() const {
#if defined __xlC__
- return vec_sqrt(v);
+ return vec_sqrt(v);
#else
- return *this * rsqrt();
+ return *this * rsqrt();
#endif
- }
- realvec tan() const { return MF::vml_tan(*this); }
- realvec tanh() const { return MF::vml_tanh(*this); }
- realvec trunc() const { return vec_trunc(v); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,4> boolvec<float,4>::as_int() const
- {
- return (__vector signed int) v;
- }
-
- inline intvec<float,4> boolvec<float,4>::convert_int() const
- {
- return -(__vector signed int)v;
- }
-
- inline boolvec<float,4> boolvec<float,4>::operator==(boolvec_t x) const
- {
- return as_int() == x.as_int();
- }
-
- inline
- boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
- inline
- intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
- inline
- realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
-
-
- // intvec definitions
-
- inline realvec<float,4> intvec<float,4>::as_float() const
- {
- return (__vector float)v;
- }
-
- inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<float,4> intvec<float,4>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<float,4> intvec<float,4>::convert_float() const
- {
+ }
+ realvec tan() const { return MF::vml_tan(*this); }
+ realvec tanh() const { return MF::vml_tanh(*this); }
+ realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+// Reinterpret the mask as a signed integer vector via a vector cast.
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+ return (__vector signed int)v;
+}
+
+// Negate the mask reinterpreted as signed integers.
+// NOTE(review): presumably true lanes are all-ones (-1), so the negation
+// yields 1 for true and 0 for false -- confirm.
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+ return -(__vector signed int)v;
+}
+
+// Lane-wise equality of two masks, computed on their integer
+// reinterpretations.
+inline boolvec<float, 4> boolvec<float, 4>::operator==(boolvec_t x) const {
+ return as_int() == x.as_int();
+}
+
+// ifthen(x, y): vec_sel(y.v, x.v, v) takes bits from x where this mask has
+// bits set and from y elsewhere (vec_sel semantics per the AltiVec manual).
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return vec_sel(y.v, x.v, v);
+}
+
+// Same selection for integer vectors.
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return vec_sel(y.v, x.v, v);
+}
+
+// Same selection for floating-point vectors.
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return vec_sel(y.v, x.v, v);
+}
+
+// intvec definitions
+
+// Reinterpret the integer lanes as floats via a vector cast.
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+ return (__vector float)v;
+}
+
+// Forwards to the generic MF::vml_bitifthen, with *this as first argument.
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+// Forwards to the generic MF::vml_clz.
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+ return MF::vml_clz(*this);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
#if defined __xlC__
- return vec_ctf(v, 0);
+ return vec_ctf(v, 0);
#else
- // vec_ctf leads to an ICE in clang
- return MF::vml_convert_float(*this);
+ // vec_ctf leads to an ICE in clang
+ return MF::vml_convert_float(*this);
#endif
- }
-
- inline intvec<float,4> intvec<float,4>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+}
+
+// Forwards to the generic MF::vml_popcount.
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+ return MF::vml_popcount(*this);
+}
+
+// Rotate by a scalar count; forwards to the generic MF::vml_rotate.
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+// Rotate with a per-lane count; forwards to the generic MF::vml_rotate.
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_ALTIVEC_FLOAT4_H
+#endif // #ifndef VEC_ALTIVEC_FLOAT4_H
diff --git a/vec_avx_double4.h b/vec_avx_double4.h
index 1352712..f01e74c 100644
--- a/vec_avx_double4.h
+++ b/vec_avx_double4.h
@@ -12,288 +12,244 @@
// AVX intrinsics
#include <immintrin.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_4
- template<> struct boolvec<double,4>;
- template<> struct intvec<double,4>;
- template<> struct realvec<double,4>;
-
-
-
- template<>
- struct boolvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef bool scalar_t;
- typedef __m256d bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - uint_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a):
- v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
- boolvec(bool const* as):
- v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]),
- from_bool(as[2]),
- from_bool(as[1]),
- from_bool(as[0])))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec_t& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
-
- boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
- boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
- boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
- boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
-
- bool all() const
- {
- // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
- return ! (! *this).any();
- }
- bool any() const
- {
- // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
- return ! bool(_mm256_testz_pd(v, v));
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef int_t scalar_t;
- typedef __m256i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm256_set1_epi64x(a)) {}
- intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
- static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+// Boolean vector with four lanes, stored in a single __m256d (AVX).
+// A lane is true when its sign bit is set; see from_bool / to_bool.
+template <> struct boolvec<double, 4> : floatprops<double> {
+ static int const size = 4;
+ typedef bool scalar_t;
+ typedef __m256d bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values have the sign bit set, false values have it unset
+ // from_bool: -uint_t(a) yields all-ones for true and zero for false.
+ static uint_t from_bool(bool a) { return -uint_t(a); }
+ // to_bool: only the sign of the lane, read as a signed integer, matters.
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ // The default constructor leaves v uninitialized.
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ // Broadcast one boolean to all four lanes.
+ boolvec(bool a) : v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {}
+ // Load four booleans from an array. _mm256_set_epi64x takes its arguments
+ // from the highest lane down, hence the reversed order, so that as[0]
+ // ends up in lane 0.
+ boolvec(bool const *as)
+ : v(_mm256_castsi256_pd(
+ _mm256_set_epi64x(from_bool(as[3]), from_bool(as[2]),
+ from_bool(as[1]), from_bool(as[0])))) {}
+
+ operator bvector_t() const { return v; }
+ // Read lane n as a bool.
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ // Overwrite lane n and return *this.
+ boolvec_t &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ // Logical NOT: XOR with an all-true vector flips every bit of every lane.
+ boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); }
+
+ // Lane-wise logical AND / OR, implemented as bitwise operations.
+ boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); }
+ boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); }
+ // == is the negation of !=, and != is a bitwise XOR, whose sign bit is
+ // set exactly in the lanes where the two sign bits differ.
+ boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+ boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); }
+
+ // True if every lane is true: no lane of the negation may be true.
+ bool all() const {
+ // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+ return !(!*this).any();
+ }
+ // True if at least one lane is true. Per the Intel documentation,
+ // _mm256_testz_pd(v, v) returns 1 only when no lane of v has its sign
+ // bit set.
+ bool any() const {
+ // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+ return !bool(_mm256_testz_pd(v, v));
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> {
+ static int const size = 4;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm256_set1_epi64x(a)) {}
+ intvec(int_t const *as) : v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {}
+ static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return _mm256_castsi256_pd(v); }
+ boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
#ifdef __AVX2__
- return *this != IV(I(0));
+ return *this != IV(I(0));
#else
- // There is no intrinsic to compare to zero. Instead, we check
- // whether x is positive and x-1 is negative.
- intvec_t x = *this;
- // We know that boolvec_t values depend only on the sign bit
- // return (~(x-1) | x).as_bool();
- // return x.as_bool() || !(x-1).as_bool();
- return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ // There is no intrinsic to compare to zero. Instead, we check
+ // whether x is negative or x + (signbit_mask - 1) is negative.
+ intvec_t x = *this;
+ // We know that boolvec_t values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
#endif
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec_t operator+() const { return *this; }
- intvec_t operator-() const { return IV(I(0)) - *this; }
-
- intvec_t operator+(intvec_t x) const
- {
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec_t operator+() const { return *this; }
+ intvec_t operator-() const { return IV(I(0)) - *this; }
+
+ intvec_t operator+(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_add_epi64(v, x.v);
+ return _mm256_add_epi64(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_add_epi64(vlo, xvlo);
- vhi = _mm_add_epi64(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi64(vlo, xvlo);
+ vhi = _mm_add_epi64(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t operator-(intvec_t x) const
- {
+ }
+ intvec_t operator-(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_sub_epi64(v, x.v);
+ return _mm256_sub_epi64(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_sub_epi64(vlo, xvlo);
- vhi = _mm_sub_epi64(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi64(vlo, xvlo);
+ vhi = _mm_sub_epi64(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-
-
-
- intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec_t operator&(intvec_t x) const
- {
+ }
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+ intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec_t operator&(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_and_si256(v, x.v);
+ return _mm256_and_si256(v, x.v);
#else
- return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v),
- _mm256_castsi256_pd(x.v)));
+ return _mm256_castpd_si256(
+ _mm256_and_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
#endif
- }
- intvec_t operator|(intvec_t x) const
- {
+ }
+ intvec_t operator|(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_or_si256(v, x.v);
+ return _mm256_or_si256(v, x.v);
#else
- return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v),
- _mm256_castsi256_pd(x.v)));
+ return _mm256_castpd_si256(
+ _mm256_or_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
#endif
- }
- intvec_t operator^(intvec_t x) const
- {
+ }
+ intvec_t operator^(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_xor_si256(v, x.v);
+ return _mm256_xor_si256(v, x.v);
#else
- return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v),
- _mm256_castsi256_pd(x.v)));
+ return _mm256_castpd_si256(
+ _mm256_xor_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v)));
#endif
- }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const
- {
+ }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const {
#ifdef __AVX2__
- return _mm256_srli_epi64(v, n);
+ return _mm256_srli_epi64(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_srli_epi64(vlo, n);
- vhi = _mm_srli_epi64(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srli_epi64(vlo, n);
+ vhi = _mm_srli_epi64(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const
- {
+ }
+ intvec_t rotate(int_t n) const;
+ intvec_t operator>>(int_t n) const {
#ifdef __AVX2__
- // There is no _mm256_srai_epi64. To emulate it, add 0x80000000
- // before shifting, and subtract the shifted 0x80000000 after
- // shifting
- intvec_t offset = U(1) << (bits-1);
- return (*this + offset).lsr(n) - offset.lsr(n);
+ // There is no _mm256_srai_epi64. To emulate it, add the sign bit
+ // (U(1) << (bits - 1)) before shifting, and subtract the shifted
+ // sign bit after shifting
+ intvec_t offset = U(1) << (bits - 1);
+ return (*this + offset).lsr(n) - offset.lsr(n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- // There is no _mm_srai_epi64. To emulate it, add 0x80000000
- // before shifting, and subtract the shifted 0x80000000 after
- // shifting
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+// There is no _mm_srai_epi64. To emulate it, add the sign bit
+// (U(1) << (bits - 1)) before shifting, and subtract the shifted
+// sign bit after shifting
#if 0
__m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0),
_mm_srli_epi64(vlo, 63));
@@ -306,532 +262,445 @@ namespace vecmathlib {
vlo = _mm_xor_si128(signmask01, vlo);
vhi = _mm_xor_si128(signmask23, vhi);
#else
- // Convert signed to unsiged
- vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1)));
- vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1)));
- // Shift
- vlo = _mm_srli_epi64(vlo, n);
- vhi = _mm_srli_epi64(vhi, n);
- // Undo conversion
- vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n)));
- vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n)));
+ // Convert signed to unsigned
+ vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1)));
+ vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1)));
+ // Shift
+ vlo = _mm_srli_epi64(vlo, n);
+ vhi = _mm_srli_epi64(vhi, n);
+ // Undo conversion
+ vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
+ vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1 - n)));
#endif
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t operator<<(int_t n) const
- {
+ }
+ intvec_t operator<<(int_t n) const {
#ifdef __AVX2__
- return _mm256_slli_epi64(v, n);
+ return _mm256_slli_epi64(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_slli_epi64(vlo, n);
- vhi = _mm_slli_epi64(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_slli_epi64(vlo, n);
+ vhi = _mm_slli_epi64(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
+ }
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const {
#ifdef __AVX2__
- return _mm256_srlv_epi64(v, n.v);
+ return _mm256_srlv_epi64(v, n.v);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const
- {
+ return r;
+#endif
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const {
#ifdef __AVX2__
- // See operator>> above
- intvec_t offset = U(1) << (bits-1);
- return (*this + offset).lsr(n) - offset.lsr(n);
+ // See operator>> above
+ intvec_t offset = U(1) << (bits - 1);
+ return (*this + offset).lsr(n) - offset.lsr(n);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- intvec_t operator<<(intvec_t n) const
- {
+ return r;
+#endif
+ }
+ intvec_t operator<<(intvec_t n) const {
#ifdef __AVX2__
- return _mm256_sllv_epi64(v, n.v);
+ return _mm256_sllv_epi64(v, n.v);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
+ return r;
+#endif
+ }
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ boolvec_t operator==(intvec_t const &x) const {
#ifdef __AVX2__
- return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v));
+ return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v));
#else
- return ! (*this != x);
+ return !(*this != x);
#endif
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
+ }
+ boolvec_t operator!=(intvec_t const &x) const {
#ifdef __AVX2__
- return ! (*this == x);
+ return !(*this == x);
#else
- return (*this ^ x).convert_bool();
+ return (*this ^ x).convert_bool();
#endif
- }
- boolvec_t operator<(intvec_t const& x) const
- {
+ }
+ boolvec_t operator<(intvec_t const &x) const {
#ifdef __AVX2__
- return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v));
+ return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v));
#else
- // return (*this - x).as_bool();
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
-#endif
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
+ // return (*this - x).as_bool();
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef real_t scalar_t;
- typedef __m256d vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() {
+ return r;
+#endif
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } // a <= b is !(a > b)
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; } // a > b is b < a
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } // a >= b is !(a < b)
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); } // mask lanes are read from the sign bit, so a bit cast suffices
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+ static int const size = 4;
+ typedef real_t scalar_t;
+ typedef __m256d vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() {
#ifdef __AVX2__
- return "<AVX2:4*double>";
+ return "<AVX2:4*double>";
#else
- return "<AVX:4*double>";
+ return "<AVX:4*double>";
#endif
+ }
+ void barrier() { __asm__("" : "+x"(v)); } // empty asm with v as an in/out operand; presumably an optimization barrier on v - TODO confirm
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {} // leaves v uninitialized
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {} // wrap a raw vector value
+ realvec(real_t a) : v(_mm256_set1_pd(a)) {} // broadcast a to every lane
+ realvec(real_t const *as) : v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {} // as[i] ends up in lane i
+
+ operator vector_t() const { return v; } // implicit access to the raw vector value
+ real_t operator[](int n) const {
+   // Read lane n through the shared element accessor.
+   real_t const lane = vecmathlib::get_elt<realvec_t, vector_t, real_t>(v, n);
+   return lane;
+ }
+ realvec_t &set_elt(int n, real_t a) {
+   // Overwrite lane n, then hand back this vector so calls can be chained.
+   vecmathlib::set_elt<realvec_t, vector_t, real_t>(v, n, a);
+   return *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) { // aligned load of one full vector starting at p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of `alignment` bytes
+ return _mm256_load_pd(p);
+ }
+ static realvec_t loadu(real_t const *p) { return _mm256_loadu_pd(p); } // unaligned load of one full vector starting at p
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+   // p itself must be aligned; p + ioff is then aligned exactly when ioff
+   // is a whole number of vectors, in which case the aligned load is used.
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t const *const src = p + ioff;
+   bool const whole_vectors = ioff % realvec::size == 0;
+   return whole_vectors ? loada(src) : loadu(src);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm256_set1_pd(a)) {}
- realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm256_load_pd(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm256_loadu_pd(p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm256_store_pd(p, v);
- }
- void storeu(real_t* p) const
- {
- return _mm256_storeu_pd(p, v);
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- _mm256_maskstore_pd(p, m.m.as_int(), v);
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- for (int d=0; d<size; ++d) {
- if (m.m[d]) p[d] = (*this)[d];
- }
- }
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+   // Masked load at p + ioff: takes the aligned path when ioff is a whole
+   // number of vectors, otherwise the unaligned path.
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t const *const src = p + ioff;
+   bool const whole_vectors = ioff % realvec::size == 0;
+   return whole_vectors ? loada(src, m) : loadu(src, m);
+ }
+
+ void storea(real_t *p) const { // aligned store of the whole vector to p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of `alignment` bytes
+ _mm256_store_pd(p, v);
+ }
+ void storeu(real_t *p) const {
+   // Unaligned store of the whole vector to p.
+   _mm256_storeu_pd(p, v);
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+   // p itself must be aligned; the aligned store is used when ioff is a
+   // whole number of vectors, the unaligned store otherwise.
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t *const dst = p + ioff;
+   if (ioff % realvec::size == 0) {
+     storea(dst);
+   } else {
+     storeu(dst);
+   }
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ _mm256_maskstore_pd(p, m.m.as_int(), v);
}
-
-
-
- intvec_t as_int() const { return _mm256_castpd_si256(v); }
- intvec_t convert_int() const
- {
- intvec_t r;
- for (int d=0; d<size; ++d) {
- r.set_elt(d, floatprops::convert_int((*this)[d]));
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ for (int d = 0; d < size; ++d) {
+ if (m.m[d])
+ p[d] = (*this)[d];
}
- return r;
- }
-
-
-
- realvec_t operator+() const { return *this; }
- realvec_t operator-() const { return RV(0.0) - *this; }
-
- realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); }
- realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); }
- realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); }
- realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
- // vml_std::fmax((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
- realvec_t y0022 = x0123.fmax(x1032);
- return vml_std::fmax(y0022[0], y0022[2]);
- }
- real_t minval() const
- {
- // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
- // vml_std::fmin((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
- realvec_t y0022 = x0123.fmin(x1032);
- return vml_std::fmin(y0022[0], y0022[2]);
- }
- real_t prod() const
- {
- // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101);
- realvec_t y0022 = x0123 * x1032;
- return y0022[0] * y0022[2];
- }
- real_t sum() const
- {
- // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
- // __m256d x = _mm256_hadd_pd(v, v);
- // __m128d xlo = _mm256_extractf128_pd(x, 0);
- // __m128d xhi = _mm256_extractf128_pd(x, 1);
- realvec_t x = *this;
- x = _mm256_hadd_pd(x.v, x.v);
- return x[0] + x[2];
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ);
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ);
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ);
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
- }
- boolvec_t operator>=(realvec_t const& x) const
- {
- return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
- }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const { return _mm256_ceil_pd(v); }
- realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return MF::vml_fabs(*this); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const { return _mm256_floor_pd(v); }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
}
- realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
- realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
- realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
- realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
-#ifdef VML_HAVE_NAN
- return _mm256_cmp_pd(v, v, _CMP_UNORD_Q);
-#else
- return BV(false);
-#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); }
- realvec_t remainder(realvec_t y) const
- {
- return MF::vml_remainder(*this, y);
- }
- realvec_t rint() const
- {
- return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return v; }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return _mm256_sqrt_pd(v); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,4> boolvec<double,4>::as_int() const
- {
- return _mm256_castpd_si256(v);
- }
-
- inline intvec<double,4> boolvec<double,4>::convert_int() const
- {
- //return ifthen(v, U(1), U(0));
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline
- intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return ifthen(x.as_float(), y.as_float()).as_int();
- }
-
- inline
- realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
- {
- return _mm256_blendv_pd(y.v, x.v, v);
- }
-
-
-
- // intvec definitions
-
- inline intvec<double,4> intvec<double,4>::abs() const
- {
- return MF::vml_abs(*this);
- }
-
- inline
- intvec<double,4> intvec<double,4>::bitifthen(intvec_t x, intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<double,4> intvec<double,4>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<double,4> intvec<double,4>::as_float() const
- {
- return _mm256_castsi256_pd(v);
- }
-
- inline realvec<double,4> intvec<double,4>::convert_float() const
- {
- realvec_t r;
- for (int d=0; d<size; ++d) {
- r.set_elt(d, floatprops::convert_float((*this)[d]));
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+   // Masked store at p + ioff: aligned path when ioff is a whole number
+   // of vectors, unaligned path otherwise.
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   real_t *const dst = p + ioff;
+   if (ioff % realvec::size == 0) {
+     storea(dst, m);
+   } else {
+     storeu(dst, m);
+   }
+ }
+
+ intvec_t as_int() const { return _mm256_castpd_si256(v); } // reinterprets the bits, no value conversion
+ intvec_t convert_int() const {
+ intvec_t r;
+ for (int d = 0; d < size; ++d) {
+ r.set_elt(d, floatprops::convert_int((*this)[d]));
}
return r;
}
-
- inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
+
+ realvec_t operator+() const { return *this; } // unary plus: returns a copy
+ realvec_t operator-() const {
+   // Negate by flipping the sign bit of every lane. Computing 0.0 - x
+   // instead returns +0.0 for x == +0.0 (default rounding) and does not
+   // flip the sign of a NaN.
+   return _mm256_xor_pd(v, _mm256_set1_pd(-0.0));
+ }
+
+ realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); } // lane-wise sum
+ realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); } // lane-wise difference
+ realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); } // lane-wise product
+ realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); } // lane-wise quotient
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } // built on operator+
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } // built on operator-
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } // built on operator*
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } // built on operator/
+
+ real_t maxval() const {
+   // Horizontal maximum of the four lanes. Swap the two lanes inside each
+   // 128-bit half, take the lane-wise maximum so that lanes 0 and 2 hold
+   // the maximum of each pair, then combine those two scalars.
+   realvec_t const swapped = _mm256_shuffle_pd(v, v, 0b0101);
+   realvec_t const pair_max = this->fmax(swapped);
+   return vml_std::fmax(pair_max[0], pair_max[2]);
+ }
+ real_t minval() const {
+   // Horizontal minimum of the four lanes. Swap the two lanes inside each
+   // 128-bit half, take the lane-wise minimum so that lanes 0 and 2 hold
+   // the minimum of each pair, then combine those two scalars.
+   realvec_t const swapped = _mm256_shuffle_pd(v, v, 0b0101);
+   realvec_t const pair_min = this->fmin(swapped);
+   return vml_std::fmin(pair_min[0], pair_min[2]);
+ }
+ real_t prod() const {
+   // Product of the four lanes, evaluated as (x0 * x1) * (x2 * x3): swap
+   // the lanes inside each 128-bit half, multiply lane-wise, then multiply
+   // the two pair products found in lanes 0 and 2.
+   realvec_t const swapped = _mm256_shuffle_pd(v, v, 0b0101);
+   realvec_t const pair_prod = *this * swapped;
+   return pair_prod[0] * pair_prod[2];
+ }
+ real_t sum() const {
+   // Sum of the four lanes, evaluated as (x0 + x1) + (x2 + x3): the
+   // horizontal add leaves x0 + x1 in lane 0 and x2 + x3 in lane 2.
+   realvec_t const pair_sum = _mm256_hadd_pd(v, v);
+   return pair_sum[0] + pair_sum[2];
+ }
+
+ boolvec_t operator==(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ); // ordered: false for a lane where either side is NaN
+ }
+ boolvec_t operator!=(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ (unordered) here, so a lane with a NaN compares as not-equal
+ }
+ boolvec_t operator<(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ); // ordered: false for a lane where either side is NaN
+ }
+ boolvec_t operator<=(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ); // ordered: false for a lane where either side is NaN
+ }
+ boolvec_t operator>(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ);
}
-
- inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
+ boolvec_t operator>=(realvec_t const &x) const {
+ return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ);
}
-
- inline intvec<double,4> intvec<double,4>::popcount() const
- {
- return MF::vml_popcount(*this);
+
+ realvec_t acos() const { return MF::vml_acos(*this); } // MF is mathfuncs<realvec_t>; vml_* calls delegate to it
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const { return _mm256_ceil_pd(v); } // direct AVX intrinsic instead of an MF routine
+ realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return MF::vml_fabs(*this); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const { return _mm256_floor_pd(v); } // direct AVX intrinsic instead of an MF routine
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return MF::vml_fma(*this, y, z); // delegates to mathfuncs; no fused hardware instruction is used here
+ }
+ realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); } // NOTE(review): NaN handling follows _mm256_max_pd and may differ from std::fmax - confirm
+ realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); } // NOTE(review): NaN handling follows _mm256_min_pd and may differ from std::fmin - confirm
+ realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+ realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
+#ifdef VML_HAVE_NAN
+ return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); // a lane is unordered with itself exactly when it is NaN
+#else
+ return BV(false); // without VML_HAVE_NAN every lane reports false
+#endif
+ }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); } // MF is mathfuncs<realvec_t>; vml_* calls delegate to it
+ realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } // same scalar exponent n for every lane
+ realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } // per-lane exponent
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
}
-
- inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); } // reciprocal via a full division 1.0 / v
+ realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+ realvec_t rint() const {
+ return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
}
-
- inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
+ realvec_t round() const { return MF::vml_round(*this); }
+ realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+ boolvec_t signbit() const { return v; } // the value's own bits serve as the mask; mask lanes are read from the sign bit
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return _mm256_sqrt_pd(v); } // direct AVX intrinsic instead of an MF routine
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); } // round toward zero
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const {
+ return _mm256_castpd_si256(v); // reinterprets the mask bits, no value conversion
+}
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+  // A logical right shift by bits - 1 moves each lane's sign bit down to
+  // bit 0, so a true lane becomes 1 and a false lane becomes 0.
+  intvec_t const mask_bits = as_int();
+  return lsr(mask_bits, bits - 1);
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+                                                     boolvec_t y) const {
+  // Select between two masks by routing them through the integer overload.
+  intvec_t const then_bits = x.as_int();
+  intvec_t const else_bits = y.as_int();
+  return ifthen(then_bits, else_bits).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+                                                    intvec_t y) const {
+  // Select between two integer vectors by routing their bit patterns
+  // through the floating-point overload.
+  realvec_t const then_bits = x.as_float();
+  realvec_t const else_bits = y.as_float();
+  return ifthen(then_bits, else_bits).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return _mm256_blendv_pd(y.v, x.v, v); // per lane: x where this mask's sign bit is set, otherwise y
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const {
+ return MF::vml_abs(*this); // delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y); // delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const {
+ return MF::vml_clz(*this); // delegates to mathfuncs<realvec_t>
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const {
+ return _mm256_castsi256_pd(v); // reinterprets the integer bits, no value conversion
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+ realvec_t r;
+ for (int d = 0; d < size; ++d) {
+ r.set_elt(d, floatprops::convert_float((*this)[d]));
}
-
+ return r;
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const {
+ return MF::vml_max(*this, x); // delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const {
+ return MF::vml_min(*this, x); // delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const {
+ return MF::vml_popcount(*this); // delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n); // same scalar rotate count n for every lane; delegates to mathfuncs<realvec_t>
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n); // per-lane rotate count; delegates to mathfuncs<realvec_t>
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_AVX_DOUBLE4_H
+#endif // #ifndef VEC_AVX_DOUBLE4_H
diff --git a/vec_avx_float8.h b/vec_avx_float8.h
index ec1e132..f119aee 100644
--- a/vec_avx_float8.h
+++ b/vec_avx_float8.h
@@ -12,828 +12,697 @@
// AVX intrinsics
#include <immintrin.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_8
- template<> struct boolvec<float,8>;
- template<> struct intvec<float,8>;
- template<> struct realvec<float,8>;
-
-
-
- template<>
- struct boolvec<float,8>: floatprops<float>
- {
- static int const size = 8;
- typedef bool scalar_t;
- typedef __m256 bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - uint_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a):
- v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {}
- boolvec(bool const* as):
- v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]),
- from_bool(as[6]),
- from_bool(as[5]),
- from_bool(as[4]),
- from_bool(as[3]),
- from_bool(as[2]),
- from_bool(as[1]),
- from_bool(as[0])))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec_t& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); }
-
- boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); }
- boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); }
- boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
- boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); }
-
- bool all() const
- {
- // return
- // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
- // (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
- return ! (! *this).any();
- }
- bool any() const
- {
- // return
- // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
- // (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
- return ! bool(_mm256_testz_ps(v, v));
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,8>: floatprops<float>
- {
- static int const size = 8;
- typedef int_t scalar_t;
- typedef __m256i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm256_set1_epi32(a)) {}
- intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4],
- as[3], as[2], as[1], as[0])) {}
- static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
+template <> struct boolvec<float, 8>;
+template <> struct intvec<float, 8>;
+template <> struct realvec<float, 8>;
+
+template <> struct boolvec<float, 8> : floatprops<float> { // Mask type: 8 boolean lanes stored in a single __m256.
+  static int const size = 8; // number of lanes
+  typedef bool scalar_t;
+  typedef __m256 bvector_t;
+  static int const alignment = sizeof(bvector_t);
+
+  static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+                "vector size is wrong");
+
+private:
+  // true values have the sign bit set, false values have it unset
+  static uint_t from_bool(bool a) { return -uint_t(a); } // true -> every bit set, false -> 0
+  static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } // looks at the sign bit only
+
+public:
+  typedef boolvec boolvec_t;
+  typedef intvec<real_t, size> intvec_t;
+  typedef realvec<real_t, size> realvec_t;
+
+  // Short names for type casts
+  typedef real_t R;
+  typedef int_t I;
+  typedef uint_t U;
+  typedef realvec_t RV;
+  typedef intvec_t IV;
+  typedef boolvec_t BV;
+  typedef floatprops<real_t> FP;
+  typedef mathfuncs<realvec_t> MF;
+
+  bvector_t v; // raw storage for all 8 lanes
+
+  boolvec() {} // leaves v uninitialized
+  // Can't have a non-trivial copy constructor; if so, objects won't
+  // be passed in registers
+  // boolvec(boolvec const& x): v(x.v) {}
+  // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+  boolvec(bvector_t x) : v(x) {}
+  boolvec(bool a) : v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {} // same value in every lane
+  boolvec(bool const *as) // reads as[0] .. as[7]; as[i] ends up in lane i because _mm256_set_epi32 takes the highest lane first
+      : v(_mm256_castsi256_ps(_mm256_set_epi32(
+            from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+            from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+            from_bool(as[1]), from_bool(as[0])))) {}
+
+  operator bvector_t() const { return v; }
+  bool operator[](int n) const { // value of lane n
+    return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+  }
+  boolvec_t &set_elt(int n, bool a) { // overwrites lane n and returns *this
+    return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+           *this;
+  }
+
+  intvec_t as_int() const;      // defined after intvec
+  intvec_t convert_int() const; // defined after intvec
+
+  boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); } // XOR with an all-true mask
+
+  boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); } // bitwise AND of the two masks
+  boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); } // bitwise OR of the two masks
+  boolvec_t operator==(boolvec_t x) const { return !(*this != x); } // expressed as the negation of !=
+  boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); } // bitwise XOR of the two masks
+
+  bool all() const {
+    // return
+    //   (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] &&
+    //   (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7];
+    return !(!*this).any(); // all lanes true <=> no lane of the negated mask is true
+  }
+  bool any() const {
+    // return
+    //   (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] ||
+    //   (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7];
+    return !bool(_mm256_testz_ps(v, v)); // testz yields 1 when no lane has its sign bit set, hence the negation
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 8> : floatprops<float> {
+ static int const size = 8;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm256_set1_epi32(a)) {}
+ intvec(int_t const *as)
+ : v(_mm256_set_epi32(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+ as[0])) {}
+ static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return _mm256_castsi256_ps(v); }
+ boolvec_t convert_bool() const {
+// Result: convert_bool(0)=false, convert_bool(else)=true
#ifdef __AVX2__
- return *this != IV(I(0));
+ return *this != IV(I(0));
#else
- // There is no intrinsic to compare to zero. Instead, we check
- // whether x is positive and x-1 is negative.
- intvec_t x = *this;
- // We know that boolvec_t values depend only on the sign bit
- // return (~(x-1) | x).as_bool();
- // return x.as_bool() || !(x-1).as_bool();
- return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+    // There is no intrinsic to compare to zero. Instead, we check whether
+    // x has its sign bit set, or x + (signbit_mask - 1) has its sign bit set.
+ intvec_t x = *this;
+ // We know that boolvec_t values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
#endif
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec_t operator+() const { return *this; }
- intvec_t operator-() const { return IV(0) - *this; }
-
- intvec_t operator+(intvec_t x) const
- {
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec_t operator+() const { return *this; }
+ intvec_t operator-() const { return IV(0) - *this; }
+
+ intvec_t operator+(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_add_epi32(v, x.v);
+ return _mm256_add_epi32(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_add_epi32(vlo, xvlo);
- vhi = _mm_add_epi32(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi32(vlo, xvlo);
+ vhi = _mm_add_epi32(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t operator-(intvec_t x) const
- {
+ }
+ intvec_t operator-(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_sub_epi32(v, x.v);
+ return _mm256_sub_epi32(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_sub_epi32(vlo, xvlo);
- vhi = _mm_sub_epi32(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi32(vlo, xvlo);
+ vhi = _mm_sub_epi32(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-
-
-
- intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec_t operator&(intvec_t x) const
- {
+ }
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+ intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec_t operator&(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_and_si256(v, x.v);
+ return _mm256_and_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec_t operator|(intvec_t x) const
- {
+ }
+ intvec_t operator|(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_or_si256(v, x.v);
+ return _mm256_or_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec_t operator^(intvec_t x) const
- {
+ }
+ intvec_t operator^(intvec_t x) const {
#ifdef __AVX2__
- return _mm256_xor_si256(v, x.v);
+ return _mm256_xor_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const
- {
+ }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const {
#ifdef __AVX2__
- return _mm256_srli_epi32(v, n);
+ return _mm256_srli_epi32(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_srli_epi32(vlo, n);
- vhi = _mm_srli_epi32(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srli_epi32(vlo, n);
+ vhi = _mm_srli_epi32(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const
- {
+ }
+ intvec_t rotate(int_t n) const;
+ intvec_t operator>>(int_t n) const {
#ifdef __AVX2__
- return _mm256_srai_epi32(v, n);
+ return _mm256_srai_epi32(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_srai_epi32(vlo, n);
- vhi = _mm_srai_epi32(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srai_epi32(vlo, n);
+ vhi = _mm_srai_epi32(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t operator<<(int_t n) const
- {
+ }
+ intvec_t operator<<(int_t n) const {
#ifdef __AVX2__
- return _mm256_slli_epi32(v, n);
+ return _mm256_slli_epi32(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_slli_epi32(vlo, n);
- vhi = _mm_slli_epi32(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_slli_epi32(vlo, n);
+ vhi = _mm_slli_epi32(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
+ }
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const {
#ifdef __AVX2__
- return _mm256_srlv_epi32(v, n.v);
+ return _mm256_srlv_epi32(v, n.v);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const
- {
+ return r;
+#endif
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const {
#ifdef __AVX2__
- return _mm256_srav_epi32(v, n.v);
+ return _mm256_srav_epi32(v, n.v);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- intvec_t operator<<(intvec_t n) const
- {
+ return r;
+#endif
+ }
+ intvec_t operator<<(intvec_t n) const {
#ifdef __AVX2__
- return _mm256_sllv_epi32(v, n.v);
+ return _mm256_sllv_epi32(v, n.v);
#else
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
-#endif
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
+ return r;
+#endif
+ }
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ boolvec_t operator==(intvec_t const &x) const {
#ifdef __AVX2__
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v));
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v));
#else
- return ! (*this != x);
+ return !(*this != x);
#endif
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
+ }
+ boolvec_t operator!=(intvec_t const &x) const {
#ifdef __AVX2__
- return ! (*this == x);
+ return !(*this == x);
#else
- return (*this ^ x).convert_bool();
+ return (*this ^ x).convert_bool();
#endif
- }
- boolvec_t operator<(intvec_t const& x) const
- {
+ }
+ boolvec_t operator<(intvec_t const &x) const {
#ifdef __AVX2__
- return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v));
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v));
#else
- // return (*this - x).as_bool();
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
-#endif
+ // return (*this - x).as_bool();
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<float,8>: floatprops<float>
- {
- static int const size = 8;
- typedef real_t scalar_t;
- typedef __m256 vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() {
+ return r;
+#endif
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<float, 8> : floatprops<float> {
+ static int const size = 8;
+ typedef real_t scalar_t;
+ typedef __m256 vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() {
#ifdef __AVX2__
- return "<AVX2:8*float>";
+ return "<AVX2:8*float>";
#else
- return "<AVX:8*float>";
+ return "<AVX:8*float>";
#endif
+ }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm256_set1_ps(a)) {}
+ realvec(real_t const *as)
+ : v(_mm256_set_ps(as[7], as[6], as[5], as[4], as[3], as[2], as[1],
+ as[0])) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm256_load_ps(p);
+ }
+ static realvec_t loadu(real_t const *p) { return _mm256_loadu_ps(p); }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm256_set1_ps(a)) {}
- realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4],
- as[3], as[2], as[1], as[0])) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm256_load_ps(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm256_loadu_ps(p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm256_store_ps(p, v);
- }
- void storeu(real_t* p) const
- {
- return _mm256_storeu_ps(p, v);
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- _mm256_maskstore_ps(p, m.m.as_int(), v);
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- // TODO: this is expensive
- for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return _mm256_castps_si256(v); }
- intvec_t convert_int() const { return _mm256_cvttps_epi32(v); }
-
-
-
- realvec_t operator+() const { return *this; }
- realvec_t operator-() const { return RV(0.0) - *this; }
-
- realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); }
- realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
- realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
- realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- // return
- // vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
- // vml_std::fmax((*this)[2], (*this)[3])),
- // vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
- // vml_std::fmax((*this)[6], (*this)[7])));
- realvec_t x01234567 = *this;
- realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
- realvec_t y00224466 = x01234567.fmax(x10325476);
- realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
- realvec_t z00004444 = y00224466.fmax(y22006644);
- return vml_std::fmax(z00004444[0], z00004444[4]);
- }
- real_t minval() const
- {
- // return
- // vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
- // vml_std::fmin((*this)[2], (*this)[3])),
- // vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
- // vml_std::fmin((*this)[6], (*this)[7])));
- realvec_t x01234567 = *this;
- realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
- realvec_t y00224466 = x01234567.fmin(x10325476);
- realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
- realvec_t z00004444 = y00224466.fmin(y22006644);
- return vml_std::fmin(z00004444[0], z00004444[4]);
- }
- real_t prod() const
- {
- // return
- // (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
- // (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
- realvec_t x01234567 = *this;
- realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001);
- realvec_t y00224466 = x01234567 * x10325476;
- realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110);
- realvec_t z00004444 = y00224466 * y22006644;
- return z00004444[0] * z00004444[4];
- }
- real_t sum() const
- {
- // return
- // (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
- // (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
- // _m256 x = vhaddps(v, v);
- // x = vhaddps(x, x);
- // __m128 xlo = _mm256_extractf128_ps(x, 0);
- // __m128 xhi = _mm256_extractf128_ps(x, 1);
- // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi);
- realvec_t x = *this;
- x = _mm256_hadd_ps(x.v, x.v);
- x = _mm256_hadd_ps(x.v, x.v);
- return x[0] + x[4];
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ);
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ);
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- boolvec_t operator>=(realvec_t const& x) const
- {
- return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm256_store_ps(p, v);
+ }
+ void storeu(real_t *p) const { return _mm256_storeu_ps(p, v); }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ _mm256_maskstore_ps(p, m.m.as_int(), v);
}
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const { return _mm256_ceil_ps(v); }
- realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return MF::vml_fabs(*this); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const { return _mm256_floor_ps(v); }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n = 0; n < size; ++n)
+ if (m.m[n])
+ p[n] = (*this)[n];
}
- realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
- realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
- realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
- realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked store of this vector to p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the base pointer p itself must be aligned
+ if (ioff % realvec::size == 0) // offset is a whole number of vectors
+ return storea(p + ioff, m); // ... so take the aligned masked store
+ storeu(p + ioff, m); // otherwise use the unaligned masked store
+ }
+
+ intvec_t as_int() const { return _mm256_castps_si256(v); } // reinterprets the bit pattern; no numeric conversion
+ intvec_t convert_int() const { return _mm256_cvttps_epi32(v); } // numeric float -> int32 conversion, truncating
+
+ realvec_t operator+() const { return *this; } // unary plus: identity
+ realvec_t operator-() const { return RV(0.0) - *this; } // NOTE(review): computed as 0 - x, so x == +0.0 yields +0.0 rather than -0.0
+
+ realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); } // element-wise arithmetic, one AVX instruction each
+ realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); }
+ realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); }
+ realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); }
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } // compound forms delegate to the binary operators
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+ real_t maxval() const { // horizontal maximum over all 8 elements
+ // Scalar reference (kept for documentation): return
+ // vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+ // vml_std::fmax((*this)[2], (*this)[3])),
+ // vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]),
+ // vml_std::fmax((*this)[6], (*this)[7])));
+ realvec_t x01234567 = *this; // variable names spell out which source element sits in each position
+ realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); // swap the two neighbours of every pair
+ realvec_t y00224466 = x01234567.fmax(x10325476); // pairwise maxima
+ realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); // swap the two pairs inside each 128-bit half
+ realvec_t z00004444 = y00224466.fmax(y22006644); // every element now holds the maximum of its 128-bit half
+ return vml_std::fmax(z00004444[0], z00004444[4]); // combine the two halves in scalar code
+ }
+ real_t minval() const { // horizontal minimum over all 8 elements
+ // Scalar reference (kept for documentation): return
+ // vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+ // vml_std::fmin((*this)[2], (*this)[3])),
+ // vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]),
+ // vml_std::fmin((*this)[6], (*this)[7])));
+ realvec_t x01234567 = *this; // variable names spell out which source element sits in each position
+ realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); // swap the two neighbours of every pair
+ realvec_t y00224466 = x01234567.fmin(x10325476); // pairwise minima
+ realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); // swap the two pairs inside each 128-bit half
+ realvec_t z00004444 = y00224466.fmin(y22006644); // every element now holds the minimum of its 128-bit half
+ return vml_std::fmin(z00004444[0], z00004444[4]); // combine the two halves in scalar code
+ }
+ real_t prod() const { // horizontal product of all 8 elements
+ // Scalar reference (kept for documentation): return
+ // (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] *
+ // (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7];
+ realvec_t x01234567 = *this; // variable names spell out which source element sits in each position
+ realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); // swap the two neighbours of every pair
+ realvec_t y00224466 = x01234567 * x10325476; // pairwise products
+ realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); // swap the two pairs inside each 128-bit half
+ realvec_t z00004444 = y00224466 * y22006644; // every element now holds the product of its 128-bit half
+ return z00004444[0] * z00004444[4]; // combine the two halves in scalar code (tree order, not left-to-right)
+ }
+ real_t sum() const { // horizontal sum of all 8 elements
+ // Scalar reference (kept for documentation): return
+ // (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] +
+ // (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7];
+ // Unused intrinsic sketch: __m256 x = vhaddps(v, v);
+ // x = vhaddps(x, x);
+ // __m128 xlo = _mm256_extractf128_ps(x, 0);
+ // __m128 xhi = _mm256_extractf128_ps(x, 1);
+ // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi); (NOTE(review): _mm_cvtsd_f64 is the double-precision extract; sketch looks copied from a double variant)
+ realvec_t x = *this;
+ x = _mm256_hadd_ps(x.v, x.v); // adjacent pairs added within each 128-bit half
+ x = _mm256_hadd_ps(x.v, x.v); // second pass: every element holds the sum of its 128-bit half
+ return x[0] + x[4]; // add the two half sums in scalar code
+ }
+
+ boolvec_t operator==(realvec_t const &x) const { // element-wise comparisons; each returns a mask vector
+ return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ); // ordered, quiet: false if either operand is NaN
+ }
+ boolvec_t operator!=(realvec_t const &x) const {
+ return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ (unordered) here, so a NaN operand compares as "not equal"
+ }
+ boolvec_t operator<(realvec_t const &x) const {
+ return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ); // ordered predicates below are likewise false for NaN operands
+ }
+ boolvec_t operator<=(realvec_t const &x) const {
+ return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ);
+ }
+ boolvec_t operator>(realvec_t const &x) const {
+ return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ);
+ }
+ boolvec_t operator>=(realvec_t const &x) const {
+ return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ);
+ }
+
+ realvec_t acos() const { return MF::vml_acos(*this); } // MF::vml_* calls forward to the generic math-function layer (defined outside this chunk)
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const { return _mm256_ceil_ps(v); } // direct AVX rounding intrinsic
+ realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return MF::vml_fabs(*this); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const { return _mm256_floor_ps(v); } // direct AVX rounding intrinsic
+ realvec_t fma(realvec_t y, realvec_t z) const { // no FMA intrinsic is used here; forwarded like the others
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); } // NOTE(review): yields y's element whenever either input is NaN, unlike std::fmax
+ realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); } // NOTE(review): same NaN caveat as fmax
+ realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+ realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } // r is passed through as the output argument
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
#ifdef VML_HAVE_NAN
- return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
+ return _mm256_cmp_ps(v, v, _CMP_UNORD_Q);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const
- {
- realvec_t x = *this;
- realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
- r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp)
- return r;
- }
- realvec_t remainder(realvec_t y) const
- {
- return MF::vml_remainder(*this, y);
- }
- realvec_t rint() const
- {
- return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const
- {
- realvec_t x = *this;
- realvec_t r = _mm256_rsqrt_ps(x); // this is only an approximation
- r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
- return r;
- }
- boolvec_t signbit() const { return v; }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return _mm256_sqrt_ps(v); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,8> boolvec<float,8>::as_int() const
- {
- return _mm256_castps_si256(v);
- }
-
- inline intvec<float,8> boolvec<float,8>::convert_int() const
- {
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<float,8> boolvec<float,8>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline intvec<float,8> boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const
- {
- return ifthen(x.as_float(), y.as_float()).as_int();
- }
-
- inline
- realvec<float,8> boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const
- {
- return _mm256_blendv_ps(y.v, x.v, v);
- }
-
-
-
- // intvec definitions
-
- inline intvec<float,8> intvec<float,8>::abs() const
- {
+ }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); } // this run of functions forwards to the MF::vml_* implementations
+ realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } // scalar exponent overload
+ realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } // vector exponent overload
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ realvec_t rcp() const { // reciprocal: hardware estimate refined by one Newton step
+ realvec_t x = *this;
+ realvec_t r = _mm256_rcp_ps(x); // this is only an approximation
+ r *= RV(2.0) - r * x; // one Newton iteration, r <- r * (2 - r*x) (see vml_rcp)
+ return r;
+ }
+ realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } // forwarded to the generic implementation
+ realvec_t rint() const { // rounds with the explicit "to nearest" mode of the AVX round instruction
+ return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+ }
+ realvec_t round() const { return MF::vml_round(*this); } // forwarded; does not use the AVX round instruction
+ realvec_t rsqrt() const { // reciprocal square root: hardware estimate refined by one Newton step
+ realvec_t x = *this;
+ realvec_t r = _mm256_rsqrt_ps(x); // this is only an approximation
+ r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration, r <- r * (1.5 - 0.5*x*r*r) (see vml_rsqrt)
+ return r;
+ }
+ boolvec_t signbit() const { return v; } // the raw vector doubles as the mask; presumably boolvec truth is carried by the sign bit (cf. boolvec convert_int)
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return _mm256_sqrt_ps(v); } // direct AVX square-root intrinsic
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); } // AVX round instruction in "toward zero" mode
+};
+
+// boolvec definitions
+
+inline intvec<float, 8> boolvec<float, 8>::as_int() const { // mask bits viewed as an integer vector
+ return _mm256_castps_si256(v); // pure reinterpretation, bits unchanged
+}
+
+inline intvec<float, 8> boolvec<float, 8>::convert_int() const { // mask -> integer values
+ return lsr(as_int(), bits - 1); // logical shift brings the sign bit down to bit 0: 1 if it was set, else 0
+}
+
+inline boolvec<float, 8> boolvec<float, 8>::ifthen(boolvec_t x, // select between two masks
+ boolvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_bool(); // reuse the integer overload via bit reinterpretation
+}
+
+inline intvec<float, 8> boolvec<float, 8>::ifthen(intvec_t x, // select between two integer vectors
+ intvec_t y) const {
+ return ifthen(x.as_float(), y.as_float()).as_int(); // reuse the float overload via bit reinterpretation
+}
+
+inline realvec<float, 8> boolvec<float, 8>::ifthen(realvec_t x, // the overload that does the actual selection
+ realvec_t y) const {
+ return _mm256_blendv_ps(y.v, x.v, v); // per element: x where the mask's sign bit is set, otherwise y
+}
+
+// intvec definitions
+
+inline intvec<float, 8> intvec<float, 8>::abs() const {
#ifdef __AVX2__
- return _mm256_abs_epi32(v);
+ return _mm256_abs_epi32(v);
#else
- return MF::vml_abs(*this);
+ return MF::vml_abs(*this);
#endif
- }
-
- inline realvec<float,8> intvec<float,8>::as_float() const
- {
- return _mm256_castsi256_ps(v);
- }
-
- inline intvec<float,8> intvec<float,8>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<float,8> intvec<float,8>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<float,8> intvec<float,8>::convert_float() const
- {
- return _mm256_cvtepi32_ps(v);
- }
-
- inline intvec<float,8> intvec<float,8>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
- }
-
- inline intvec<float,8> intvec<float,8>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
- }
-
- inline intvec<float,8> intvec<float,8>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<float,8> intvec<float,8>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,8> intvec<float,8>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+}
+
+inline realvec<float, 8> intvec<float, 8>::as_float() const { // integer bits viewed as floats
+ return _mm256_castsi256_ps(v); // pure reinterpretation, no numeric conversion
+}
+
+inline intvec<float, 8> intvec<float, 8>::bitifthen(intvec_t x, // forwarded to MF::vml_bitifthen with *this as first argument
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y); // NOTE(review): presumably a bitwise select with *this as the bit mask; confirm in mathfuncs
+}
+
+inline intvec<float, 8> intvec<float, 8>::clz() const { // forwarded to the generic MF implementation
+ return MF::vml_clz(*this);
+}
+
+inline realvec<float, 8> intvec<float, 8>::convert_float() const { // numeric int32 -> float conversion
+ return _mm256_cvtepi32_ps(v);
+}
+
+inline intvec<float, 8> intvec<float, 8>::max(intvec_t x) const { // NOTE(review): always the generic path; no AVX2 branch here although abs() has one
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::min(intvec_t x) const { // NOTE(review): same as max, generic path only
+ return MF::vml_min(*this, x);
+}
+
+inline intvec<float, 8> intvec<float, 8>::popcount() const { // forwarded to the generic MF implementation
+ return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(int_t n) const { // scalar rotation count overload
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 8> intvec<float, 8>::rotate(intvec_t n) const { // vector rotation count overload
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_AVX_FLOAT8_H
+#endif // #ifndef VEC_AVX_FLOAT8_H
diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h
index ddade85..6af27e5 100644
--- a/vec_avx_fp16_16.h
+++ b/vec_avx_fp16_16.h
@@ -12,378 +12,309 @@
// AVX intrinsics
#include <immintrin.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FP16_16
- template<> struct boolvec<fp16,16>;
- template<> struct intvec<fp16,16>;
- template<> struct realvec<fp16,16>;
-
-
-
- template<>
- struct boolvec<fp16,16>: floatprops<fp16>
- {
- static int const size = 16;
- typedef bool scalar_t;
- typedef __m256i bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - uint_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
- boolvec(bool const* as):
- v(_mm256_set_epi16(from_bool(as[15]),
- from_bool(as[14]),
- from_bool(as[13]),
- from_bool(as[12]),
- from_bool(as[11]),
- from_bool(as[10]),
- from_bool(as[ 9]),
- from_bool(as[ 8]),
- from_bool(as[ 7]),
- from_bool(as[ 6]),
- from_bool(as[ 5]),
- from_bool(as[ 4]),
- from_bool(as[ 3]),
- from_bool(as[ 2]),
- from_bool(as[ 1]),
- from_bool(as[ 0]))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return *this != boolvec(true); }
-
- boolvec operator&&(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
- boolvec operator||(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
- boolvec operator==(boolvec x) const { return !(*this!=x); }
- boolvec operator!=(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
-
- bool all() const
- {
- bool r = (*this)[0];
- for (int n=1; n<size; ++n) r = r && (*this)[n];
- return r;
- }
- bool any() const
- {
- bool r = (*this)[0];;
- for (int n=1; n<size; ++n) r = r || (*this)[n];
- return r;
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<fp16,16>: floatprops<fp16>
- {
- static int const size = 16;
- typedef int_t scalar_t;
- typedef __m256i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm256_set1_epi16(a)) {}
- intvec(int_t const* as):
- v(_mm256_set_epi16(as[15],
- as[14],
- as[13],
- as[12],
- as[11],
- as[10],
- as[ 9],
- as[ 8],
- as[ 7],
- as[ 6],
- as[ 5],
- as[ 4],
- as[ 3],
- as[ 2],
- as[ 1],
- as[ 0])) {}
- static intvec iota()
- {
- return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
- 7, 6, 5, 4, 3, 2, 1, 0);
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return v; }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- // There is no intrinsic to compare to zero. Instead, we check
- // whether x is positive and x-1 is negative.
- intvec x = *this;
- // We know that boolvec values depend only on the sign bit
- // return (~(x-1) | x).as_bool();
- // return x.as_bool() || !(x-1).as_bool();
- return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return IV(I(0)) - *this; }
-
- intvec operator+(intvec x) const
- {
+template <> struct boolvec<fp16, 16>; // declared up front: the three specializations name one another in their typedefs
+template <> struct intvec<fp16, 16>;
+template <> struct realvec<fp16, 16>;
+
+template <> struct boolvec<fp16, 16> : floatprops<fp16> { // 16-element boolean mask vector for fp16, held in one __m256i
+ static int const size = 16;
+ typedef bool scalar_t;
+ typedef __m256i bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t), // 16 elements must exactly fill the 256-bit register
+ "vector size is wrong");
+
+private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return -uint_t(a); } // negating 0/1: false -> 0, true -> all bits set (after conversion back to uint_t)
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } // only the sign bit decides
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v; // the mask itself, one 16-bit element per boolean
+
+ boolvec() {} // leaves v uninitialized
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {} // wrap a raw register value
+ boolvec(bool a) : v(_mm256_set1_epi16(from_bool(a))) {} // broadcast one boolean to all 16 elements
+ boolvec(bool const *as) // reads as[0] .. as[15]
+ : v(_mm256_set_epi16(from_bool(as[15]), from_bool(as[14]), // _mm256_set_epi16 takes the highest element first,
+ from_bool(as[13]), from_bool(as[12]), // so as[0] ends up in element 0
+ from_bool(as[11]), from_bool(as[10]),
+ from_bool(as[9]), from_bool(as[8]), from_bool(as[7]),
+ from_bool(as[6]), from_bool(as[5]), from_bool(as[4]),
+ from_bool(as[3]), from_bool(as[2]), from_bool(as[1]),
+ from_bool(as[0]))) {}
+
+ operator bvector_t() const { return v; } // implicit access to the raw register
+ bool operator[](int n) const { // read element n as a bool
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) { // write element n, then return *this (comma expression)
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return *this != boolvec(true); } // != is a bitwise xor, so xor with all-true inverts every element
+
+ boolvec operator&&(boolvec x) const { // bitwise AND, issued through the float-domain instruction via casts
+ return _mm256_castps_si256(
+ _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator||(boolvec x) const { // bitwise OR, same cast technique
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator==(boolvec x) const { return !(*this != x); } // equality is the negated xor
+ boolvec operator!=(boolvec x) const { // bitwise XOR: set where the two masks differ
+ return _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+
+ bool all() const { // true iff every element is true; scalar loop over the elements
+ bool r = (*this)[0];
+ for (int n = 1; n < size; ++n)
+ r = r && (*this)[n];
+ return r;
+ }
+ bool any() const { // true iff at least one element is true; scalar loop over the elements
+ bool r = (*this)[0];
+ ; // stray empty statement, harmless
+ for (int n = 1; n < size; ++n)
+ r = r || (*this)[n];
+ return r;
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp16, 16> : floatprops<fp16> {
+ static int const size = 16;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm256_set1_epi16(a)) {}
+ intvec(int_t const *as)
+ : v(_mm256_set_epi16(as[15], as[14], as[13], as[12], as[11], as[10],
+ as[9], as[8], as[7], as[6], as[5], as[4], as[3],
+ as[2], as[1], as[0])) {}
+ static intvec iota() {
+ return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ 0);
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return v; }
+ boolvec_t convert_bool() const {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare to zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec x = *this;
+ // We know that boolvec values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return IV(I(0)) - *this; }
+
+ intvec operator+(intvec x) const {
#ifdef __AVX2__
- return _mm256_add_epi16(v, x.v);
+ return _mm256_add_epi16(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_add_epi16(vlo, xvlo);
- vhi = _mm_add_epi16(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi16(vlo, xvlo);
+ vhi = _mm_add_epi16(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator-(intvec x) const
- {
+ }
+ intvec operator-(intvec x) const {
#ifdef __AVX2__
- return _mm256_sub_epi16(v, x.v);
+ return _mm256_sub_epi16(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_sub_epi16(vlo, xvlo);
- vhi = _mm_sub_epi16(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi16(vlo, xvlo);
+ vhi = _mm_sub_epi16(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
-
-
-
- intvec operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec operator&(intvec x) const
- {
+ }
+
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+ intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec operator&(intvec x) const {
#ifdef __AVX2__
- return _mm256_and_si256(v, x.v);
+ return _mm256_and_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec operator|(intvec x) const
- {
+ }
+ intvec operator|(intvec x) const {
#ifdef __AVX2__
- return _mm256_or_si256(v, x.v);
+ return _mm256_or_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec operator^(intvec x) const
- {
+ }
+ intvec operator^(intvec x) const {
#ifdef __AVX2__
- return _mm256_xor_si256(v, x.v);
+ return _mm256_xor_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
-
-
- intvec lsr(int_t n) const
- {
+ }
+
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec lsr(int_t n) const {
#ifdef __AVX2__
- return _mm256_srli_epi16(v, n);
+ return _mm256_srli_epi16(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_srli_epi16(vlo, n);
- vhi = _mm_srli_epi16(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srli_epi16(vlo, n);
+ vhi = _mm_srli_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator>>(int_t n) const
- {
+ }
+ intvec operator>>(int_t n) const {
#ifdef __AVX2__
- return _mm256_srai_epi16(v, n);
+ return _mm256_srai_epi16(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_srai_epi16(vlo, n);
- vhi = _mm_srai_epi16(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srai_epi16(vlo, n);
+ vhi = _mm_srai_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator<<(int_t n) const
- {
+ }
+ intvec operator<<(int_t n) const {
#ifdef __AVX2__
- return _mm256_slli_epi16(v, n);
+ return _mm256_slli_epi16(v, n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- vlo = _mm_slli_epi16(vlo, n);
- vhi = _mm_slli_epi16(vhi, n);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_slli_epi16(vlo, n);
+ vhi = _mm_slli_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec lsr(intvec n) const
- {
+ }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec lsr(intvec n) const {
#ifdef __AVX2__
- // TODO: Use permute instead of shift/mask?
- _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
- _mm256i vlo = _mm256_and_si256(mlo, v);
- _mm256i vhi = v;
- _mm256i clo = _mm256_and_si256(mlo, n);
- _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
- _mm256i rlo = _mm256_srlv_epi32(vlo, clo);
- _mm256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi));
- return _mm256_or_si256(rhi, rlo);
+ // TODO: Use permute instead of shift/mask?
+ _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
+ _mm256i vlo = _mm256_and_si256(mlo, v);
+ _mm256i vhi = v;
+ _mm256i clo = _mm256_and_si256(mlo, n);
+ _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
+ _mm256i rlo = _mm256_srlv_epi32(vlo, clo);
+ _mm256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi));
+ return _mm256_or_si256(rhi, rlo);
#else
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
-#endif
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- intvec operator>>(intvec n) const
- {
+ return r;
+#endif
+ }
+ intvec operator>>(intvec n) const {
#ifdef __AVX2__
- intvec_t offset = U(1) << (bits-1);
- return (*this + offset).lsr(n) - offset.lsr(n);
+ intvec_t offset = U(1) << (bits - 1);
+ return (*this + offset).lsr(n) - offset.lsr(n);
#else
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
-#endif
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- intvec operator<<(intvec n) const
- {
+ return r;
+#endif
+ }
+ intvec operator<<(intvec n) const {
#ifdef __AVX2__
- // TODO: Use permute instead of shift/mask?
- _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
- _mm256i vlo = v;
+ // TODO: Use permute instead of shift/mask?
+ _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff));
+ _mm256i vlo = v;
_mm256i vhi = _mm256_andnot_si256(mlo, v;
_mm256i clo = _mm256_and_si256(mlo, n);
_mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16));
@@ -391,338 +322,274 @@ namespace vecmathlib {
_mm256i rhi = _mm256_sllv_epi32(vhi, chi);
return _mm256_or_si256(rhi, rlo);
#else
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
-#endif
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
-
-
- boolvec_t operator==(intvec const& x) const
- {
+ return r;
+#endif
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ boolvec_t operator==(intvec const &x) const {
#ifdef __AVX2__
- return _mm256_cmpeq_epi16(v, x.v);
+ return _mm256_cmpeq_epi16(v, x.v);
#else
- return ! (*this != x);
+ return !(*this != x);
#endif
- }
- boolvec_t operator!=(intvec const& x) const
- {
+ }
+ boolvec_t operator!=(intvec const &x) const {
#ifdef __AVX2__
- return ! (*this == x);
+ return !(*this == x);
#else
- return (*this ^ x).convert_bool();
+ return (*this ^ x).convert_bool();
#endif
- }
- boolvec_t operator<(intvec const& x) const
- {
+ }
+ boolvec_t operator<(intvec const &x) const {
#ifdef __AVX2__
- return _mm256_cmpgt_epi16(x.v, v);
+ return _mm256_cmpgt_epi16(x.v, v);
#else
- // TODO: First compare sign; then if equal, compare sign of difference
- // TODO: Also look for intrinsics
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
-#endif
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
+ // TODO: First compare sign; then if equal, compare sign of difference
+ // TODO: Also look for intrinsics
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<fp16,16>: floatprops<fp16>
- {
- static int const size = 16;
- typedef real_t scalar_t;
- typedef __m256i vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() {
+ return r;
+#endif
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<fp16, 16> : floatprops<fp16> {
+ static int const size = 16;
+ typedef real_t scalar_t;
+ typedef __m256i vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() {
#ifdef __AVX2__
- return "<AVX2:16*fp16>";
+ return "<AVX2:16*fp16>";
#else
- return "<AVX:16*fp16>";
+ return "<AVX:16*fp16>";
#endif
+ }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm256_set1_epi16(FP::as_int(a))) {}
+ realvec(real_t const *as)
+ : v(_mm256_set_epi16(
+ FP::as_int(as[15]), FP::as_int(as[14]), FP::as_int(as[13]),
+ FP::as_int(as[12]), FP::as_int(as[11]), FP::as_int(as[10]),
+ FP::as_int(as[9]), FP::as_int(as[8]), FP::as_int(as[7]),
+ FP::as_int(as[6]), FP::as_int(as[5]), FP::as_int(as[4]),
+ FP::as_int(as[3]), FP::as_int(as[2]), FP::as_int(as[1]),
+ FP::as_int(as[0]))) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm256_load_si256((__m256i const *)p);
+ }
+ static realvec_t loadu(real_t const *p) {
+ return _mm256_loadu_si256((__m256i const *)p);
+ }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
- realvec(real_t const* as):
- v(_mm256_set_epi16(FP::as_int(as[15]),
- FP::as_int(as[14]),
- FP::as_int(as[13]),
- FP::as_int(as[12]),
- FP::as_int(as[11]),
- FP::as_int(as[10]),
- FP::as_int(as[ 9]),
- FP::as_int(as[ 8]),
- FP::as_int(as[ 7]),
- FP::as_int(as[ 6]),
- FP::as_int(as[ 5]),
- FP::as_int(as[ 4]),
- FP::as_int(as[ 3]),
- FP::as_int(as[ 2]),
- FP::as_int(as[ 1]),
- FP::as_int(as[ 0]))) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm256_load_si256((__m256i const*)p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm256_loadu_si256((__m256i const*)p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm256_store_si256((__m256i*)p, v);
- }
- void storeu(real_t* p) const
- {
- return _mm256_storeu_si256((__m256i*)p, v);
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- // TODO: this is expensive
- for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- // TODO: this is expensive
- for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
-
-
-
- intvec_t as_int() const { return v; }
- intvec_t convert_int() const { __builtin_unreachable(); }
-
-
-
- realvec operator+() const { __builtin_unreachable(); }
- realvec operator-() const { __builtin_unreachable(); }
-
- realvec operator+(realvec x) const { __builtin_unreachable(); }
- realvec operator-(realvec x) const { __builtin_unreachable(); }
- realvec operator*(realvec x) const { __builtin_unreachable(); }
- realvec operator/(realvec x) const { __builtin_unreachable(); }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const { __builtin_unreachable(); }
- real_t minval() const { __builtin_unreachable(); }
- real_t prod() const { __builtin_unreachable(); }
- real_t sum() const { __builtin_unreachable(); }
-
-
-
- boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-
-
-
- realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
- realvec fabs() const { return MF::vml_fabs(*this); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- boolvec_t signbit() const { return v; }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<fp16,16> boolvec<fp16,16>::as_int() const
- {
- return v;
- }
-
- inline intvec<fp16,16> boolvec<fp16,16>::convert_int() const
- {
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<fp16,16> boolvec<fp16,16>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline intvec<fp16,16> boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const
- {
- return (( -convert_int() & x) | (~-convert_int() & y));
- }
-
- inline
- realvec<fp16,16> boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_float();
- }
-
-
-
- // intvec definitions
-
- inline intvec<fp16,16> intvec<fp16,16>::abs() const
- {
-#ifdef __AVX2__
- return _mm256_abs_epi16(v);
-#else
- return MF::vml_abs(*this);
-#endif
}
-
- inline realvec<fp16,16> intvec<fp16,16>::as_float() const
- {
- return v;
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm256_store_si256((__m256i *)p, v);
}
-
- inline realvec<fp16,16> intvec<fp16,16>::convert_float() const
- {
- __builtin_unreachable();
+ void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // TODO: this is expensive
+ for (int n = 0; n < size; ++n)
+ if (m.m[n])
+ p[n] = (*this)[n];
+ }
}
-
- inline intvec<fp16,16> intvec<fp16,16>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n = 0; n < size; ++n)
+ if (m.m[n])
+ p[n] = (*this)[n];
+ }
}
-
- inline intvec<fp16,16> intvec<fp16,16>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
}
-
+
+ intvec_t as_int() const { return v; }
+ intvec_t convert_int() const { __builtin_unreachable(); }
+
+ realvec operator+() const { __builtin_unreachable(); }
+ realvec operator-() const { __builtin_unreachable(); }
+
+ realvec operator+(realvec x) const { __builtin_unreachable(); }
+ realvec operator-(realvec x) const { __builtin_unreachable(); }
+ realvec operator*(realvec x) const { __builtin_unreachable(); }
+ realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; }
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ real_t maxval() const { __builtin_unreachable(); }
+ real_t minval() const { __builtin_unreachable(); }
+ real_t prod() const { __builtin_unreachable(); }
+ real_t sum() const { __builtin_unreachable(); }
+
+ boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ boolvec_t signbit() const { return v; }
+};
+
+// boolvec definitions
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::as_int() const { return v; }
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::convert_int() const {
+ return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<fp16, 16> boolvec<fp16, 16>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<fp16, 16> boolvec<fp16, 16>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return ((-convert_int() & x) | (~ - convert_int() & y));
+}
+
+inline realvec<fp16, 16> boolvec<fp16, 16>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_float();
+}
+
+// intvec definitions
+
+inline intvec<fp16, 16> intvec<fp16, 16>::abs() const {
+#ifdef __AVX2__
+ return _mm256_abs_epi16(v);
+#else
+ return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<fp16, 16> intvec<fp16, 16>::as_float() const { return v; }
+
+inline realvec<fp16, 16> intvec<fp16, 16>::convert_float() const {
+ __builtin_unreachable();
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::max(intvec_t x) const {
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<fp16, 16> intvec<fp16, 16>::min(intvec_t x) const {
+ return MF::vml_min(*this, x);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_AVX_FP16_16_H
+#endif // #ifndef VEC_AVX_FP16_16_H
diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h
index 912bd19..0ae79e7 100644
--- a/vec_avx_fp8_32.h
+++ b/vec_avx_fp8_32.h
@@ -12,763 +12,592 @@
// AVX intrinsics
#include <immintrin.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FP8_32
- template<> struct boolvec<fp8,32>;
- template<> struct intvec<fp8,32>;
- template<> struct realvec<fp8,32>;
-
-
-
- template<>
- struct boolvec<fp8,32>: floatprops<fp8>
- {
- static int const size = 32;
- typedef bool scalar_t;
- typedef __m256i bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - uint_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
- boolvec(bool const* as):
- v(_mm256_set_epi8(from_bool(as[31]),
- from_bool(as[30]),
- from_bool(as[29]),
- from_bool(as[28]),
- from_bool(as[27]),
- from_bool(as[26]),
- from_bool(as[25]),
- from_bool(as[24]),
- from_bool(as[23]),
- from_bool(as[22]),
- from_bool(as[21]),
- from_bool(as[20]),
- from_bool(as[19]),
- from_bool(as[18]),
- from_bool(as[17]),
- from_bool(as[16]),
- from_bool(as[15]),
- from_bool(as[14]),
- from_bool(as[13]),
- from_bool(as[12]),
- from_bool(as[11]),
- from_bool(as[10]),
- from_bool(as[ 9]),
- from_bool(as[ 8]),
- from_bool(as[ 7]),
- from_bool(as[ 6]),
- from_bool(as[ 5]),
- from_bool(as[ 4]),
- from_bool(as[ 3]),
- from_bool(as[ 2]),
- from_bool(as[ 1]),
- from_bool(as[ 0]))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return *this != boolvec(true); }
-
- boolvec operator&&(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
- boolvec operator||(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
- boolvec operator==(boolvec x) const { return !(*this!=x); }
- boolvec operator!=(boolvec x) const
- {
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
- }
-
- bool all() const
- {
- bool r = (*this)[0];
- for (int n=1; n<size; ++n) r = r && (*this)[n];
- return r;
- }
- bool any() const
- {
- bool r = (*this)[0];;
- for (int n=1; n<size; ++n) r = r || (*this)[n];
- return r;
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<fp8,32>: floatprops<fp8>
- {
- static int const size = 32;
- typedef int_t scalar_t;
- typedef __m256i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm256_set1_epi8(a)) {}
- intvec(int_t const* as):
- v(_mm256_set_epi8(as[31],
- as[30],
- as[29],
- as[28],
- as[27],
- as[26],
- as[25],
- as[24],
- as[23],
- as[22],
- as[21],
- as[20],
- as[19],
- as[18],
- as[17],
- as[16],
- as[15],
- as[14],
- as[13],
- as[12],
- as[11],
- as[10],
- as[ 9],
- as[ 8],
- as[ 7],
- as[ 6],
- as[ 5],
- as[ 4],
- as[ 3],
- as[ 2],
- as[ 1],
- as[ 0])) {}
- static intvec iota()
- {
- return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
- 23, 22, 21, 20, 19, 18, 17, 16,
- 15, 14, 13, 12, 11, 10, 9, 8,
- 7, 6, 5, 4, 3, 2, 1, 0);
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return v; }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- // There is no intrinsic to compare to zero. Instead, we check
- // whether x is positive and x-1 is negative.
- intvec x = *this;
- // We know that boolvec values depend only on the sign bit
- // return (~(x-1) | x).as_bool();
- // return x.as_bool() || !(x-1).as_bool();
- return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return IV(I(0)) - *this; }
-
- intvec operator+(intvec x) const
- {
+template <> struct boolvec<fp8, 32>;
+template <> struct intvec<fp8, 32>;
+template <> struct realvec<fp8, 32>;
+
+template <> struct boolvec<fp8, 32> : floatprops<fp8> {
+ static int const size = 32;
+ typedef bool scalar_t;
+ typedef __m256i bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return -uint_t(a); }
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(_mm256_set1_epi8(from_bool(a))) {}
+ boolvec(bool const *as)
+ : v(_mm256_set_epi8(
+ from_bool(as[31]), from_bool(as[30]), from_bool(as[29]),
+ from_bool(as[28]), from_bool(as[27]), from_bool(as[26]),
+ from_bool(as[25]), from_bool(as[24]), from_bool(as[23]),
+ from_bool(as[22]), from_bool(as[21]), from_bool(as[20]),
+ from_bool(as[19]), from_bool(as[18]), from_bool(as[17]),
+ from_bool(as[16]), from_bool(as[15]), from_bool(as[14]),
+ from_bool(as[13]), from_bool(as[12]), from_bool(as[11]),
+ from_bool(as[10]), from_bool(as[9]), from_bool(as[8]),
+ from_bool(as[7]), from_bool(as[6]), from_bool(as[5]),
+ from_bool(as[4]), from_bool(as[3]), from_bool(as[2]),
+ from_bool(as[1]), from_bool(as[0]))) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return *this != boolvec(true); }
+
+ boolvec operator&&(boolvec x) const {
+ return _mm256_castps_si256(
+ _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator||(boolvec x) const {
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator==(boolvec x) const { return !(*this != x); }
+ boolvec operator!=(boolvec x) const {
+ return _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
+ }
+
+ bool all() const {
+ bool r = (*this)[0];
+ for (int n = 1; n < size; ++n)
+ r = r && (*this)[n];
+ return r;
+ }
+ bool any() const {
+ bool r = (*this)[0];
+ ;
+ for (int n = 1; n < size; ++n)
+ r = r || (*this)[n];
+ return r;
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<fp8, 32> : floatprops<fp8> {
+ static int const size = 32;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm256_set1_epi8(a)) {}
+ intvec(int_t const *as)
+ : v(_mm256_set_epi8(as[31], as[30], as[29], as[28], as[27], as[26],
+ as[25], as[24], as[23], as[22], as[21], as[20],
+ as[19], as[18], as[17], as[16], as[15], as[14],
+ as[13], as[12], as[11], as[10], as[9], as[8], as[7],
+ as[6], as[5], as[4], as[3], as[2], as[1], as[0])) {}
+ static intvec iota() {
+ return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
+ 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
+ 3, 2, 1, 0);
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return v; }
+ boolvec_t convert_bool() const {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare to zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec x = *this;
+ // We know that boolvec values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return IV(I(0)) - *this; }
+
+ intvec operator+(intvec x) const {
#ifdef __AVX2__
- return _mm256_add_epi8(v, x.v);
+ return _mm256_add_epi8(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_add_epi8(vlo, xvlo);
- vhi = _mm_add_epi8(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi8(vlo, xvlo);
+ vhi = _mm_add_epi8(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator-(intvec x) const
- {
+ }
+ intvec operator-(intvec x) const {
#ifdef __AVX2__
- return _mm256_sub_epi8(v, x.v);
+ return _mm256_sub_epi8(v, x.v);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- __m128i xvlo = _mm256_castsi256_si128(x.v);
- __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
- vlo = _mm_sub_epi8(vlo, xvlo);
- vhi = _mm_sub_epi8(vhi, xvhi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi8(vlo, xvlo);
+ vhi = _mm_sub_epi8(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
-
-
-
- intvec operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec operator&(intvec x) const
- {
+ }
+
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+
+ intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec operator&(intvec x) const {
#ifdef __AVX2__
- return _mm256_and_si256(v, x.v);
+ return _mm256_and_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec operator|(intvec x) const
- {
+ }
+ intvec operator|(intvec x) const {
#ifdef __AVX2__
- return _mm256_or_si256(v, x.v);
+ return _mm256_or_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
- intvec operator^(intvec x) const
- {
+ }
+ intvec operator^(intvec x) const {
#ifdef __AVX2__
- return _mm256_xor_si256(v, x.v);
+ return _mm256_xor_si256(v, x.v);
#else
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
- _mm256_castsi256_ps(x.v)));
+ return _mm256_castps_si256(
+ _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v)));
#endif
- }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
-
-
- intvec lsr(int_t n) const
- {
+ }
+
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec lsr(int_t n) const {
#ifdef __AVX2__
- uint_t masklo = U(0x00ffU) >> U(n);
- uint_t maskhi = U(0xff00U);
- intvec mask = masklo | maskhi;
- return intvec(_mm256_srai_epi16(v, n)) & mask;
+ uint_t masklo = U(0x00ffU) >> U(n);
+ uint_t maskhi = U(0xff00U);
+ intvec mask = masklo | maskhi;
+ return intvec(_mm256_srai_epi16(v, n)) & mask;
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- uint_t masklo = U(0x00ffU) >> U(n);
- uint_t maskhi = U(0xff00U);
- __m128i mask = _mm_set1_epi16(masklo | maskhi);
- vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
- vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU) >> U(n);
+ uint_t maskhi = U(0xff00U);
+ __m128i mask = _mm_set1_epi16(masklo | maskhi);
+ vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+ vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator>>(int_t n) const
- {
+ }
+ intvec operator>>(int_t n) const {
#ifdef __AVX2__
- // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
- // shifting, and subtract the shifted 0x80 after shifting
- intvec_t offset = U(1) << (bits-1);
- return (*this + offset).lsr(n) - offset.lsr(n);
+ // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
+ // shifting, and subtract the shifted 0x80 after shifting
+ intvec_t offset = U(1) << (bits - 1);
+ return (*this + offset).lsr(n) - offset.lsr(n);
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- uint_t masklo = U(0x00ffU);
- uint_t maskhi = U(0xff00U);
- __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
- _mm_set1_epi16(masklo));
- __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
- _mm_set1_epi16(maskhi));
- vlo = _mm_or_si128(vlolo, vlohi);
- __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
- _mm_set1_epi16(masklo));
- __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
- _mm_set1_epi16(maskhi));
- vhi = _mm_or_si128(vhilo, vhihi);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU);
+ uint_t maskhi = U(0xff00U);
+ __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n + 8),
+ _mm_set1_epi16(masklo));
+ __m128i vlohi =
+ _mm_and_si128(_mm_srai_epi16(vlo, n), _mm_set1_epi16(maskhi));
+ vlo = _mm_or_si128(vlolo, vlohi);
+ __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n + 8),
+ _mm_set1_epi16(masklo));
+ __m128i vhihi =
+ _mm_and_si128(_mm_srai_epi16(vhi, n), _mm_set1_epi16(maskhi));
+ vhi = _mm_or_si128(vhilo, vhihi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
- }
- intvec operator<<(int_t n) const
- {
+ }
+ intvec operator<<(int_t n) const {
#ifdef __AVX2__
- uint_t masklo = U(0x00ffU);
- uint_t maskhi = U(0xff00U) << U(n);
- intvec mask = masklo | maskhi;
- return intvec(_mm256_slli_epi16(v, n)) & mask;
+ uint_t masklo = U(0x00ffU);
+ uint_t maskhi = U(0xff00U) << U(n);
+ intvec mask = masklo | maskhi;
+ return intvec(_mm256_slli_epi16(v, n)) & mask;
#else
- __m128i vlo = _mm256_castsi256_si128(v);
- __m128i vhi = _mm256_extractf128_si256(v, 1);
- uint_t masklo = U(0x00ffU);
- uint_t maskhi = U(0xff00U) << U(n);
- __m128i mask = _mm_set1_epi16(masklo | maskhi);
- vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
- vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
- return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU);
+ uint_t maskhi = U(0xff00U) << U(n);
+ __m128i mask = _mm_set1_epi16(masklo | maskhi);
+ vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+ vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
#endif
+ }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec lsr(intvec n) const {
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec lsr(intvec n) const
- {
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
- }
- intvec operator>>(intvec n) const
- {
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
+ return r;
+ }
+ intvec operator>>(intvec n) const {
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- intvec operator<<(intvec n) const
- {
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
+ return r;
+ }
+ intvec operator<<(intvec n) const {
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
-
-
- boolvec_t operator==(intvec const& x) const
- {
+ return r;
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ boolvec_t operator==(intvec const &x) const {
#ifdef __AVX2__
- return _mm256_cmpeq_epi8(v, x.v);
+ return _mm256_cmpeq_epi8(v, x.v);
#else
- return ! (*this != x);
+ return !(*this != x);
#endif
- }
- boolvec_t operator!=(intvec const& x) const
- {
+ }
+ boolvec_t operator!=(intvec const &x) const {
#ifdef __AVX2__
- return ! (*this == x);
+ return !(*this == x);
#else
- return (*this ^ x).convert_bool();
+ return (*this ^ x).convert_bool();
#endif
- }
- boolvec_t operator<(intvec const& x) const
- {
+ }
+ boolvec_t operator<(intvec const &x) const {
#ifdef __AVX2__
- return _mm256_cmpgt_epi8(x.v, v);
+ return _mm256_cmpgt_epi8(x.v, v);
#else
- // TODO: First compare sign; then if equal, compare sign of difference
- // TODO: Also look for intrinsics
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
-#endif
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
+ // TODO: First compare sign; then if equal, compare sign of difference
+ // TODO: Also look for intrinsics
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<fp8,32>: floatprops<fp8>
- {
- static int const size = 32;
- typedef real_t scalar_t;
- typedef __m256i vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() {
+ return r;
+#endif
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<fp8, 32> : floatprops<fp8> {
+ static int const size = 32;
+ typedef real_t scalar_t;
+ typedef __m256i vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() {
#ifdef __AVX2__
- return "<AVX2:32*fp8>";
+ return "<AVX2:32*fp8>";
#else
- return "<AVX:32*fp8>";
+ return "<AVX:32*fp8>";
#endif
+ }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm256_set1_epi8(FP::as_int(a))) {}
+ realvec(real_t const *as)
+ : v(_mm256_set_epi8(
+ FP::as_int(as[31]), FP::as_int(as[30]), FP::as_int(as[29]),
+ FP::as_int(as[28]), FP::as_int(as[27]), FP::as_int(as[26]),
+ FP::as_int(as[25]), FP::as_int(as[24]), FP::as_int(as[23]),
+ FP::as_int(as[22]), FP::as_int(as[21]), FP::as_int(as[20]),
+ FP::as_int(as[19]), FP::as_int(as[18]), FP::as_int(as[17]),
+ FP::as_int(as[16]), FP::as_int(as[15]), FP::as_int(as[14]),
+ FP::as_int(as[13]), FP::as_int(as[12]), FP::as_int(as[11]),
+ FP::as_int(as[10]), FP::as_int(as[9]), FP::as_int(as[8]),
+ FP::as_int(as[7]), FP::as_int(as[6]), FP::as_int(as[5]),
+ FP::as_int(as[4]), FP::as_int(as[3]), FP::as_int(as[2]),
+ FP::as_int(as[1]), FP::as_int(as[0]))) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm256_load_si256((__m256i const *)p);
+ }
+ static realvec_t loadu(real_t const *p) {
+ return _mm256_loadu_si256((__m256i const *)p);
+ }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
- realvec(real_t const* as):
- v(_mm256_set_epi8(FP::as_int(as[31]),
- FP::as_int(as[30]),
- FP::as_int(as[29]),
- FP::as_int(as[28]),
- FP::as_int(as[27]),
- FP::as_int(as[26]),
- FP::as_int(as[25]),
- FP::as_int(as[24]),
- FP::as_int(as[23]),
- FP::as_int(as[22]),
- FP::as_int(as[21]),
- FP::as_int(as[20]),
- FP::as_int(as[19]),
- FP::as_int(as[18]),
- FP::as_int(as[17]),
- FP::as_int(as[16]),
- FP::as_int(as[15]),
- FP::as_int(as[14]),
- FP::as_int(as[13]),
- FP::as_int(as[12]),
- FP::as_int(as[11]),
- FP::as_int(as[10]),
- FP::as_int(as[ 9]),
- FP::as_int(as[ 8]),
- FP::as_int(as[ 7]),
- FP::as_int(as[ 6]),
- FP::as_int(as[ 5]),
- FP::as_int(as[ 4]),
- FP::as_int(as[ 3]),
- FP::as_int(as[ 2]),
- FP::as_int(as[ 1]),
- FP::as_int(as[ 0]))) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm256_load_si256((__m256i const*)p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm256_loadu_si256((__m256i const*)p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm256_store_si256((__m256i*)p, v);
- }
- void storeu(real_t* p) const
- {
- return _mm256_storeu_si256((__m256i*)p, v);
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- // TODO: this is expensive
- for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- // TODO: this is expensive
- for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
-
-
-
- intvec_t as_int() const { return v; }
- intvec_t convert_int() const { __builtin_unreachable(); }
-
-
-
- realvec operator+() const { __builtin_unreachable(); }
- realvec operator-() const { __builtin_unreachable(); }
-
- realvec operator+(realvec x) const { __builtin_unreachable(); }
- realvec operator-(realvec x) const { __builtin_unreachable(); }
- realvec operator*(realvec x) const { __builtin_unreachable(); }
- realvec operator/(realvec x) const { __builtin_unreachable(); }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const { __builtin_unreachable(); }
- real_t minval() const { __builtin_unreachable(); }
- real_t prod() const { __builtin_unreachable(); }
- real_t sum() const { __builtin_unreachable(); }
-
-
-
- boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
-
-
-
- realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
- realvec fabs() const { return MF::vml_fabs(*this); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- boolvec_t signbit() const { return v; }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<fp8,32> boolvec<fp8,32>::as_int() const
- {
- return v;
- }
-
- inline intvec<fp8,32> boolvec<fp8,32>::convert_int() const
- {
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<fp8,32> boolvec<fp8,32>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const
- {
- return (( -convert_int() & x) | (~-convert_int() & y));
- }
-
- inline
- realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_float();
- }
-
-
-
- // intvec definitions
-
- inline intvec<fp8,32> intvec<fp8,32>::abs() const
- {
-#ifdef __AVX2__
- return _mm256_abs_epi8(v);
-#else
- return MF::vml_abs(*this);
-#endif
}
-
- inline realvec<fp8,32> intvec<fp8,32>::as_float() const
- {
- return v;
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm256_store_si256((__m256i *)p, v);
}
-
- inline realvec<fp8,32> intvec<fp8,32>::convert_float() const
- {
- __builtin_unreachable();
+ void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
}
-
- inline intvec<fp8,32> intvec<fp8,32>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // TODO: this is expensive
+ for (int n = 0; n < size; ++n)
+ if (m.m[n])
+ p[n] = (*this)[n];
+ }
}
-
- inline intvec<fp8,32> intvec<fp8,32>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n = 0; n < size; ++n)
+ if (m.m[n])
+ p[n] = (*this)[n];
+ }
}
-
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return v; }
+ intvec_t convert_int() const { __builtin_unreachable(); }
+
+ realvec operator+() const { __builtin_unreachable(); }
+ realvec operator-() const { __builtin_unreachable(); }
+
+ realvec operator+(realvec x) const { __builtin_unreachable(); }
+ realvec operator-(realvec x) const { __builtin_unreachable(); }
+ realvec operator*(realvec x) const { __builtin_unreachable(); }
+ realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; }
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ real_t maxval() const { __builtin_unreachable(); }
+ real_t minval() const { __builtin_unreachable(); }
+ real_t prod() const { __builtin_unreachable(); }
+ real_t sum() const { __builtin_unreachable(); }
+
+ boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); }
+ boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); }
+
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ boolvec_t signbit() const { return v; }
+};
+
+// boolvec definitions
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::as_int() const { return v; }
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::convert_int() const {
+ return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<fp8, 32> boolvec<fp8, 32>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<fp8, 32> boolvec<fp8, 32>::ifthen(intvec_t x, intvec_t y) const {
+ return ((-convert_int() & x) | (~ - convert_int() & y));
+}
+
+inline realvec<fp8, 32> boolvec<fp8, 32>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_float();
+}
+
+// intvec definitions
+
+inline intvec<fp8, 32> intvec<fp8, 32>::abs() const {
+#ifdef __AVX2__
+ return _mm256_abs_epi8(v);
+#else
+ return MF::vml_abs(*this);
+#endif
+}
+
+inline realvec<fp8, 32> intvec<fp8, 32>::as_float() const { return v; }
+
+inline realvec<fp8, 32> intvec<fp8, 32>::convert_float() const {
+ __builtin_unreachable();
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::max(intvec_t x) const {
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<fp8, 32> intvec<fp8, 32>::min(intvec_t x) const {
+ return MF::vml_min(*this, x);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_AVX_FP8_32_H
+#endif // #ifndef VEC_AVX_FP8_32_H
diff --git a/vec_base.h b/vec_base.h
index 737a1e0..81c698d 100644
--- a/vec_base.h
+++ b/vec_base.h
@@ -4,663 +4,544 @@
#define VEC_BASE_H
#ifndef VML_NO_IOSTREAM
-# include <iostream>
+#include <iostream>
#endif
#include "vec_mask.h"
+namespace vecmathlib {
+template <typename real_t, int size> struct boolvec {};
-namespace vecmathlib {
-
- template<typename real_t, int size>
- struct boolvec {
- };
-
- template<typename real_t, int size>
- struct intvec {
- };
-
- template<typename real_t, int size>
- struct realvec {
- };
-
-
-
- // boolvec wrappers
-
- template<typename real_t, int size>
- inline intvec<real_t, size> as_int(boolvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> convert_int(boolvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline bool all(boolvec<real_t, size> x) { return x.all(); }
-
- template<typename real_t, int size>
- inline bool any(boolvec<real_t, size> x) { return x.any(); }
-
- template<typename real_t, int size>
- inline
- boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
- boolvec<real_t, size> x,
- boolvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- intvec<real_t, size> ifthen(boolvec<real_t, size> c,
- intvec<real_t, size> x,
- intvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- realvec<real_t, size> ifthen(boolvec<real_t, size> c,
- realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
-
-
- // intvec wrappers
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> as_bool(intvec<real_t, size> x)
- {
- return x.as_bool();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x)
- {
- return x.convert_bool();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> as_float(intvec<real_t, size> x)
- {
- return x.as_float();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> convert_float(intvec<real_t, size> x)
- {
- return x.convert_float();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> abs(intvec<real_t, size> x)
- {
- return x.abs();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
- intvec<real_t, size> y,
- intvec<real_t, size> z)
- {
- return x.bitifthen(y, z);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> clz(intvec<real_t, size> x)
- {
- return x.clz();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> isignbit(intvec<real_t, size> x)
- {
- return x.isignbit();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> lsr(intvec<real_t, size> x,
- typename intvec<real_t, size>::int_t n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> lsr(intvec<real_t, size> x,
- intvec<real_t, size> n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> max(intvec<real_t, size> x,
- intvec<real_t, size> y)
- {
- return x.max(y);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> min(intvec<real_t, size> x,
- intvec<real_t, size> y)
- {
- return x.min(y);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> popcount(intvec<real_t, size> x)
- {
- return x.popcount();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> rotate(intvec<real_t, size> x,
- typename intvec<real_t, size>::int_t n)
- {
- return x.rotate(n);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> rotate(intvec<real_t, size> x,
- intvec<real_t, size> n)
- {
- return x.rotate(n);
- }
-
-
-
- // realvec wrappers
-
- template<typename real_t, int size>
- inline realvec<real_t, size>
- loada(real_t const* p,
- realvec<real_t, size> x,
- typename realvec<real_t, size>::mask_t const& m)
- {
- return x.loada(p, m);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size>
- loadu(real_t const* p,
- realvec<real_t, size> x,
- typename realvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, m);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size>
- loadu(real_t const* p, size_t ioff,
- realvec<real_t, size> x,
- typename realvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, ioff, m);
- }
-
- template<typename real_t, int size>
- inline void storea(realvec<real_t, size> x, real_t* p)
- {
- x.storea(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realvec<real_t, size> x, real_t* p)
- {
- x.storeu(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff)
- {
- x.storeu(p, ioff);
- }
-
- template<typename real_t, int size>
- inline void storea(realvec<real_t, size> x, real_t* p,
- typename realvec<real_t, size>::mask_t const& m)
- {
- x.storea(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realvec<real_t, size> x, real_t* p,
- typename realvec<real_t, size>::mask_t const& m)
- {
- x.storeu(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff,
- typename realvec<real_t, size>::mask_t const &m)
- {
- x.storeu(p, ioff, m);
- }
-
-
-
- template<typename real_t, int size>
- inline intvec<real_t, size> as_int(realvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> convert_int(realvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline
- typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x)
- {
- return x.maxval();
- }
-
- template<typename real_t, int size>
- inline
- typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x)
- {
- return x.minval();
- }
-
- template<typename real_t, int size>
- inline
- typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x)
- {
- return x.prod();
- }
-
- template<typename real_t, int size>
- inline
- typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x)
- {
- return x.sum();
- }
-
-
-
- template<typename real_t, int size>
- inline realvec<real_t, size> acos(realvec<real_t, size> x)
- {
- return x.acos();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> acosh(realvec<real_t, size> x)
- {
- return x.acosh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> asin(realvec<real_t, size> x)
- {
- return x.asin();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> asinh(realvec<real_t, size> x)
- {
- return x.asinh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> atan(realvec<real_t, size> x)
- {
- return x.atan();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> atan2(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.atan2(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> atanh(realvec<real_t, size> x)
- {
- return x.atanh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> cbrt(realvec<real_t, size> x)
- {
- return x.cbrt();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> ceil(realvec<real_t, size> x)
- {
- return x.ceil();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> copysign(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.copysign(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> cos(realvec<real_t, size> x)
- {
- return x.cos();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> cosh(realvec<real_t, size> x)
- {
- return x.cosh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> exp(realvec<real_t, size> x)
- {
- return x.exp();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> exp10(realvec<real_t, size> x)
- {
- return x.exp10();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> exp2(realvec<real_t, size> x)
- {
- return x.exp2();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> expm1(realvec<real_t, size> x)
- {
- return x.expm1();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fabs(realvec<real_t, size> x)
- {
- return x.fabs();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> floor(realvec<real_t, size> x)
- {
- return x.floor();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fdim(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.fdim(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fma(realvec<real_t, size> x,
- realvec<real_t, size> y,
- realvec<real_t, size> z)
- {
- return x.fma(y, z);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fmax(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.fmax(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fmin(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.fmin(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> fmod(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.fmod(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> frexp(realvec<real_t, size> x,
- intvec<real_t, size>* r)
- {
- return x.frexp(r);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> hypot(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.hypot(y);
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> ilogb(realvec<real_t, size> x)
- {
- return x.ilogb();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> isfinite(realvec<real_t, size> x)
- {
- return x.isfinite();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> isinf(realvec<real_t, size> x)
- {
- return x.isinf();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> isnan(realvec<real_t, size> x)
- {
- return x.isnan();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> isnormal(realvec<real_t, size> x)
- {
- return x.isnormal();
- }
-
- template<typename real_t, int size>
- inline
- realvec<real_t, size> ldexp(realvec<real_t, size> x,
- typename intvec<real_t, size>::int_t n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline
- realvec<real_t, size> ldexp(realvec<real_t, size> x,
- intvec<real_t, size> n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> log(realvec<real_t, size> x)
- {
- return x.log();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> log10(realvec<real_t, size> x)
- {
- return x.log10();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> log1p(realvec<real_t, size> x)
- {
- return x.log1p();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> log2(realvec<real_t, size> x)
- {
- return x.log2();
- }
-
- template<typename real_t, int size>
- inline intvec<real_t, size> lrint(realvec<real_t, size> x)
- {
- return x.lrint();
- }
+template <typename real_t, int size> struct intvec {};
+
+template <typename real_t, int size> struct realvec {};
+
+// boolvec wrappers
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(boolvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(boolvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size> inline bool all(boolvec<real_t, size> x) {
+ return x.all();
+}
+
+template <typename real_t, int size> inline bool any(boolvec<real_t, size> x) {
+ return x.any();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> ifthen(boolvec<real_t, size> c,
+ boolvec<real_t, size> x,
+ boolvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ifthen(boolvec<real_t, size> c,
+ intvec<real_t, size> x,
+ intvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ifthen(boolvec<real_t, size> c,
+ realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+// intvec wrappers
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> as_bool(intvec<real_t, size> x) {
+ return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x) {
+ return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> as_float(intvec<real_t, size> x) {
+ return x.as_float();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> convert_float(intvec<real_t, size> x) {
+ return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> abs(intvec<real_t, size> x) {
+ return x.abs();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> bitifthen(intvec<real_t, size> x,
+ intvec<real_t, size> y,
+ intvec<real_t, size> z) {
+ return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> clz(intvec<real_t, size> x) {
+ return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isignbit(intvec<real_t, size> x) {
+ return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+ typename intvec<real_t, size>::int_t n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lsr(intvec<real_t, size> x,
+ intvec<real_t, size> n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> max(intvec<real_t, size> x,
+ intvec<real_t, size> y) {
+ return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> min(intvec<real_t, size> x,
+ intvec<real_t, size> y) {
+ return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> popcount(intvec<real_t, size> x) {
+ return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+ typename intvec<real_t, size>::int_t n) {
+ return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> rotate(intvec<real_t, size> x,
+ intvec<real_t, size> n) {
+ return x.rotate(n);
+}
+
+// realvec wrappers
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loada(real_t const *p, realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const &m) {
+ return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realvec<real_t, size> x,
+ typename realvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p) {
+ x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p) {
+ x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff) {
+ x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realvec<real_t, size> x, real_t *p,
+ typename realvec<real_t, size>::mask_t const &m) {
+ x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p,
+ typename realvec<real_t, size>::mask_t const &m) {
+ x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff,
+ typename realvec<real_t, size>::mask_t const &m) {
+ x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> as_int(realvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> convert_int(realvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x) {
+ return x.maxval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x) {
+ return x.minval();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x) {
+ return x.prod();
+}
+
+template <typename real_t, int size>
+inline typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x) {
+ return x.sum();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acos(realvec<real_t, size> x) {
+ return x.acos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> acosh(realvec<real_t, size> x) {
+ return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asin(realvec<real_t, size> x) {
+ return x.asin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> asinh(realvec<real_t, size> x) {
+ return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan(realvec<real_t, size> x) {
+ return x.atan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atan2(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> atanh(realvec<real_t, size> x) {
+ return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cbrt(realvec<real_t, size> x) {
+ return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ceil(realvec<real_t, size> x) {
+ return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> copysign(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cos(realvec<real_t, size> x) {
+ return x.cos();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> cosh(realvec<real_t, size> x) {
+ return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp(realvec<real_t, size> x) {
+ return x.exp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp10(realvec<real_t, size> x) {
+ return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> exp2(realvec<real_t, size> x) {
+ return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> expm1(realvec<real_t, size> x) {
+ return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fabs(realvec<real_t, size> x) {
+ return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> floor(realvec<real_t, size> x) {
+ return x.floor();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fdim(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+fma(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+ return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmax(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmin(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> fmod(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> frexp(realvec<real_t, size> x,
+ intvec<real_t, size> *r) {
+ return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> hypot(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> ilogb(realvec<real_t, size> x) {
+ return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isfinite(realvec<real_t, size> x) {
+ return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isinf(realvec<real_t, size> x) {
+ return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnan(realvec<real_t, size> x) {
+ return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> isnormal(realvec<real_t, size> x) {
+ return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+ typename intvec<real_t, size>::int_t n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> ldexp(realvec<real_t, size> x,
+ intvec<real_t, size> n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log(realvec<real_t, size> x) {
+ return x.log();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log10(realvec<real_t, size> x) {
+ return x.log10();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log1p(realvec<real_t, size> x) {
+ return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> log2(realvec<real_t, size> x) {
+ return x.log2();
+}
+
+template <typename real_t, int size>
+inline intvec<real_t, size> lrint(realvec<real_t, size> x) {
+ return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size>
+mad(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) {
+ return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> pow(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rcp(realvec<real_t, size> x) {
+ return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> remainder(realvec<real_t, size> x,
+ realvec<real_t, size> y) {
+ return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rint(realvec<real_t, size> x) {
+ return x.rint();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> round(realvec<real_t, size> x) {
+ return x.round();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> rsqrt(realvec<real_t, size> x) {
+ return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolvec<real_t, size> signbit(realvec<real_t, size> x) {
+ return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sin(realvec<real_t, size> x) {
+ return x.sin();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sinh(realvec<real_t, size> x) {
+ return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> sqrt(realvec<real_t, size> x) {
+ return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tan(realvec<real_t, size> x) {
+ return x.tan();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> tanh(realvec<real_t, size> x) {
+ return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realvec<real_t, size> trunc(realvec<real_t, size> x) {
+ return x.trunc();
+}
- template<typename real_t, int size>
- inline realvec<real_t, size> mad(realvec<real_t, size> x,
- realvec<real_t, size> y,
- realvec<real_t, size> z)
- {
- return x.mad(y, z);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> nextafter(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.nextafter(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> pow(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.pow(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> rcp(realvec<real_t, size> x)
- {
- return x.rcp();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> remainder(realvec<real_t, size> x,
- realvec<real_t, size> y)
- {
- return x.remainder(y);
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> rint(realvec<real_t, size> x)
- {
- return x.rint();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> round(realvec<real_t, size> x)
- {
- return x.round();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> rsqrt(realvec<real_t, size> x)
- {
- return x.rsqrt();
- }
-
- template<typename real_t, int size>
- inline boolvec<real_t, size> signbit(realvec<real_t, size> x)
- {
- return x.signbit();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> sin(realvec<real_t, size> x)
- {
- return x.sin();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> sinh(realvec<real_t, size> x)
- {
- return x.sinh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> sqrt(realvec<real_t, size> x)
- {
- return x.sqrt();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> tan(realvec<real_t, size> x)
- {
- return x.tan();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> tanh(realvec<real_t, size> x)
- {
- return x.tanh();
- }
-
- template<typename real_t, int size>
- inline realvec<real_t, size> trunc(realvec<real_t, size> x)
- {
- return x.trunc();
- }
-
-
-
#ifndef VML_NO_IOSTREAM
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, boolvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, intvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
#endif
-
+
} // namespace vecmathlib
-#endif // #ifndef VEC_BASE_H
+#endif // #ifndef VEC_BASE_H
diff --git a/vec_builtin.h b/vec_builtin.h
index bbe4277..2f1ff90 100644
--- a/vec_builtin.h
+++ b/vec_builtin.h
@@ -12,1450 +12,1253 @@
#include <cmath>
#include <cstring>
#ifndef VML_NO_IOSTREAM
-# include <sstream>
+#include <sstream>
#endif
#include <string>
+namespace vecmathlib {
+template <typename T, int N> struct boolbuiltinvec;
+template <typename T, int N> struct intbuiltinvec;
+template <typename T, int N> struct realbuiltinvec;
-namespace vecmathlib {
-
- template<typename T, int N> struct boolbuiltinvec;
- template<typename T, int N> struct intbuiltinvec;
- template<typename T, int N> struct realbuiltinvec;
-
-
-
- template<typename T, int N>
- struct boolbuiltinvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static const int size = N;
- typedef bool scalar_t;
- typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
- static const int alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true is -1, false is 0
- static int_t from_bool(bool a) { return -uint_t(a); }
- static bool to_bool(int_t a) { return a; }
- public:
-
- typedef boolbuiltinvec boolvec_t;
- typedef intbuiltinvec<real_t, size> intvec_t;
- typedef realbuiltinvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolbuiltinvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
- // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
- // Can't have a constructor from bvector_t, since this would
- // conflict with the constructor from bool
- // boolbuiltinvec(bvector_t x): v(x) {}
- static boolvec_t mkvec(bvector_t x) { boolvec_t res; res.v=x; return res; }
- boolbuiltinvec(bool a): v(from_bool(a)) {}
- boolbuiltinvec(const bool* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const { return to_bool(v[n]); }
- boolvec_t& set_elt(int n, bool a) { return v[n]=from_bool(a), *this; }
-
-
-
- intvec_t as_int() const; // defined after intbuiltinvec
- intvec_t convert_int() const; // defined after intbuiltinvec
-
-
-
- boolvec_t operator!() const { return mkvec(!v); }
-
- boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
- boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
- boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
- boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
-
- bool all() const
- {
- bool res = (*this)[0];
- for (int d=1; d<size; ++d) res = res && (*this)[d];
- return res;
- }
- bool any() const
- {
- bool res = (*this)[0];
- for (int d=1; d<size; ++d) res = res || (*this)[d];
- return res;
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realbuiltinvec
- };
-
-
-
- template<typename T, int N>
- struct intbuiltinvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static const int size = N;
- typedef int_t scalar_t;
- typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
- typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
- static const int alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
- static_assert(size * sizeof(real_t) == sizeof(uvector_t),
- "vector size is wrong");
-
- typedef boolbuiltinvec<real_t, size> boolvec_t;
- typedef intbuiltinvec intvec_t;
- typedef realbuiltinvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intbuiltinvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
- // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
- // Can't have a constructor from ivector_t, since this would
- // conflict with the constructor from int_t
- // intbuiltinvec(ivector_t x): v(x) {}
- static intvec_t mkvec(ivector_t x) { intvec_t res; res.v=x; return res; }
- intbuiltinvec(int_t a): v(a) {}
- intbuiltinvec(const int_t* as) { std::memcpy(&v, as, sizeof v); }
- static intvec_t iota()
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, d);
- return res;
- }
-
- int_t operator[](int n) const { return v[n]; }
- intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-
-
-
- boolvec_t as_bool() const
- {
- boolvec_t res;
- std::memcpy(&res.v, &v, sizeof res.v);
- return res;
- }
- boolvec_t convert_bool() const { return *this != IV(I(0)); }
- realvec_t as_float() const; // defined after realbuiltinvec
- realvec_t convert_float() const; // defined after realbuiltinvec
-
-
-
- intvec_t operator+() const { return mkvec(+v); }
- intvec_t operator-() const { return mkvec(-v); }
-
- intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); }
- intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
- intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
- intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); }
- intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); }
-
- intvec_t& operator+=(const intvec_t& x) { return *this=*this+x; }
- intvec_t& operator-=(const intvec_t& x) { return *this=*this-x; }
- intvec_t& operator*=(const intvec_t& x) { return *this=*this*x; }
- intvec_t& operator/=(const intvec_t& x) { return *this=*this/x; }
- intvec_t& operator%=(const intvec_t& x) { return *this=*this%x; }
-
-
-
- intvec_t operator~() const { return mkvec(~v); }
-
- intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); }
- intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
- intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
-
- intvec_t& operator&=(const intvec_t& x) { return *this=*this&x; }
- intvec_t& operator|=(const intvec_t& x) { return *this=*this|x; }
- intvec_t& operator^=(const intvec_t& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
-
-
- intvec_t lsr(int_t n) const
- {
- return mkvec(ivector_t(uvector_t(v) >> U(n)));
- }
- intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
- intvec_t operator>>(int_t n) const { return mkvec(v >> n); }
- intvec_t operator<<(int_t n) const { return mkvec(v << n); }
-
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- return mkvec(ivector_t(uvector_t(v)>>uvector_t(n.v)));
- }
- intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
- intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); }
- intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
-
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) {
- int_t val = (*this)[d];
- int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
- res.set_elt(d, cnt);
- }
- return res;
- }
- intvec_t popcount() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_popcount(U((*this)[d])));
- }
- return res;
- }
-
-
-
- boolvec_t operator==(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v == x.v);
- }
- boolvec_t operator!=(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v != x.v);
- }
- boolvec_t operator<(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v < x.v);
- }
- boolvec_t operator<=(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v <= x.v);
- }
- boolvec_t operator>(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v > x.v);
- }
- boolvec_t operator>=(const intvec_t& x) const
- {
- return boolvec_t::mkvec(v >= x.v);
+template <typename T, int N> struct boolbuiltinvec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static const int size = N;
+ typedef bool scalar_t;
+ typedef int_t bvector_t __attribute__((__ext_vector_type__(N)));
+ static const int alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true is -1, false is 0
+ static int_t from_bool(bool a) { return -uint_t(a); }
+ static bool to_bool(int_t a) { return a; }
+
+public:
+ typedef boolbuiltinvec boolvec_t;
+ typedef intbuiltinvec<real_t, size> intvec_t;
+ typedef realbuiltinvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolbuiltinvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {}
+ // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; }
+ // Can't have a constructor from bvector_t, since this would
+ // conflict with the constructor from bool
+ // boolbuiltinvec(bvector_t x): v(x) {}
+ static boolvec_t mkvec(bvector_t x) {
+ boolvec_t res;
+ res.v = x;
+ return res;
+ }
+ boolbuiltinvec(bool a) : v(from_bool(a)) {}
+ boolbuiltinvec(const bool *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return to_bool(v[n]); }
+ boolvec_t &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; }
+
+ intvec_t as_int() const; // defined after intbuiltinvec
+ intvec_t convert_int() const; // defined after intbuiltinvec
+
+ boolvec_t operator!() const { return mkvec(!v); }
+
+ boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); }
+ boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); }
+ boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); }
+ boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); }
+
+ bool all() const {
+ bool res = (*this)[0];
+ for (int d = 1; d < size; ++d)
+ res = res && (*this)[d];
+ return res;
+ }
+ bool any() const {
+ bool res = (*this)[0];
+ for (int d = 1; d < size; ++d)
+ res = res || (*this)[d];
+ return res;
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec
+ realvec_t ifthen(realvec_t x,
+ realvec_t y) const; // defined after realbuiltinvec
+};
+
+template <typename T, int N> struct intbuiltinvec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static const int size = N;
+ typedef int_t scalar_t;
+ typedef int_t ivector_t __attribute__((__ext_vector_type__(N)));
+ typedef uint_t uvector_t __attribute__((__ext_vector_type__(N)));
+ static const int alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+ static_assert(size * sizeof(real_t) == sizeof(uvector_t),
+ "vector size is wrong");
+
+ typedef boolbuiltinvec<real_t, size> boolvec_t;
+ typedef intbuiltinvec intvec_t;
+ typedef realbuiltinvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intbuiltinvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intbuiltinvec(const intbuiltinvec& x): v(x.v) {}
+ // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; }
+ // Can't have a constructor from ivector_t, since this would
+ // conflict with the constructor from int_t
+ // intbuiltinvec(ivector_t x): v(x) {}
+ static intvec_t mkvec(ivector_t x) {
+ intvec_t res;
+ res.v = x;
+ return res;
+ }
+ intbuiltinvec(int_t a) : v(a) {}
+ intbuiltinvec(const int_t *as) { std::memcpy(&v, as, sizeof v); }
+ static intvec_t iota() {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, d);
+ return res;
+ }
+
+ int_t operator[](int n) const { return v[n]; } // reads element n; no range check on n is done here
+ intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; } // writes element n, then returns *this
+
+ boolvec_t as_bool() const { // copies the bytes of v into a boolvec_t without converting values
+ boolvec_t res;
+ std::memcpy(&res.v, &v, sizeof res.v); // the copy length is taken from the destination vector
+ return res;
+ }
+ boolvec_t convert_bool() const { return *this != IV(I(0)); } // result of operator!= against a vector built from I(0)
+ realvec_t as_float() const; // defined after realbuiltinvec
+ realvec_t convert_float() const; // defined after realbuiltinvec
+
+ intvec_t operator+() const { return mkvec(+v); } // unary plus applied to the raw vector
+ intvec_t operator-() const { return mkvec(-v); } // unary minus applied to the raw vector
+
+ intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); } // each binary operator applies the same operator to the two raw vectors
+ intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); }
+ intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); }
+ intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); } // no check for zero elements in x is made here
+ intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); } // no check for zero elements in x is made here
+
+ intvec_t &operator+=(const intvec_t &x) { return *this = *this + x; } // compound forms assign the result of the matching binary operator
+ intvec_t &operator-=(const intvec_t &x) { return *this = *this - x; }
+ intvec_t &operator*=(const intvec_t &x) { return *this = *this * x; }
+ intvec_t &operator/=(const intvec_t &x) { return *this = *this / x; }
+ intvec_t &operator%=(const intvec_t &x) { return *this = *this % x; }
+
+ intvec_t operator~() const { return mkvec(~v); } // bitwise complement of the raw vector
+
+ intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); } // bitwise operators applied to the two raw vectors
+ intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); }
+ intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); }
+
+ intvec_t &operator&=(const intvec_t &x) { return *this = *this & x; } // compound forms assign the result of the matching binary operator
+ intvec_t &operator|=(const intvec_t &x) { return *this = *this | x; }
+ intvec_t &operator^=(const intvec_t &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const { // forwards to the generic implementation with *this as first argument
+ return MF::vml_bitifthen(*this, x, y);
+ }
+
+ intvec_t lsr(int_t n) const { return mkvec(ivector_t(uvector_t(v) >> U(n))); } // shift is carried out on the unsigned vector type, then cast back
+ intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); } // forwards to the generic implementation
+ intvec_t operator>>(int_t n) const { return mkvec(v >> n); } // shifts the signed vector directly (contrast with lsr)
+ intvec_t operator<<(int_t n) const { return mkvec(v << n); }
+
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; } // compound forms assign the result of the matching shift operator
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const { // as lsr(int_t), but the shift counts come from the vector n
+ return mkvec(ivector_t(uvector_t(v) >> uvector_t(n.v)));
+ }
+ intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); } // forwards to the generic implementation
+ intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); } // shifts the signed vector by the counts in n.v
+ intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); }
+
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d) {
+ int_t val = (*this)[d];
+ int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val));
+ res.set_elt(d, cnt);
}
-
- intvec_t abs() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, builtin_abs((*this)[d]));
- return res;
+ return res;
+ }
+ intvec_t popcount() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_popcount(U((*this)[d])));
}
-
- boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
-
- intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
- intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
- };
-
-
-
- template<typename T, int N>
- struct realbuiltinvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static const int size = N;
- typedef real_t scalar_t;
- typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
- static const int alignment = sizeof(vector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
+ return res;
+ }
+
+ boolvec_t operator==(const intvec_t &x) const { // comparisons: the raw vector comparison result is wrapped by boolvec_t::mkvec
+ return boolvec_t::mkvec(v == x.v);
+ }
+ boolvec_t operator!=(const intvec_t &x) const {
+ return boolvec_t::mkvec(v != x.v);
+ }
+ boolvec_t operator<(const intvec_t &x) const {
+ return boolvec_t::mkvec(v < x.v);
+ }
+ boolvec_t operator<=(const intvec_t &x) const {
+ return boolvec_t::mkvec(v <= x.v);
+ }
+ boolvec_t operator>(const intvec_t &x) const {
+ return boolvec_t::mkvec(v > x.v);
+ }
+ boolvec_t operator>=(const intvec_t &x) const {
+ return boolvec_t::mkvec(v >= x.v);
+ }
+
+ intvec_t abs() const { // applies builtin_abs to each element in turn
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, builtin_abs((*this)[d]));
+ return res;
+ }
+
+ boolvec_t isignbit() const { return MF::vml_isignbit(*this); } // forwards to the generic implementation
+
+ intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } // forwards to the generic implementation
+ intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } // forwards to the generic implementation
+};
+
+template <typename T, int N> struct realbuiltinvec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static const int size = N;
+ typedef real_t scalar_t;
+ typedef real_t vector_t __attribute__((__ext_vector_type__(N)));
+ static const int alignment = sizeof(vector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
#ifndef VML_NO_IOSTREAM
- static const char* name()
- {
- static std::string name_;
- if (name_.empty()) {
- std::stringstream buf;
- buf << "<builtin:" << N << "*" << FP::name() << ">";
- name_ = buf.str();
- }
- return name_.c_str();
+ static const char *name() {
+ static std::string name_;
+ if (name_.empty()) {
+ std::stringstream buf;
+ buf << "<builtin:" << N << "*" << FP::name() << ">";
+ name_ = buf.str();
}
+ return name_.c_str();
+ }
#endif
- void barrier() { volatile vector_t x __attribute__((__unused__)) = v; }
-
- typedef boolbuiltinvec<real_t, size> boolvec_t;
- typedef intbuiltinvec<real_t, size> intvec_t;
- typedef realbuiltinvec realvec_t;
-
- private:
- boolvec_t mapb(bool f(real_t)) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- intvec_t map(int_t f(real_t)) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t)) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, int_t), intvec_t x) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, int_t*), intvec_t* x) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) {
- int_t ix;
- res.v[d] = f(v[d], &ix);
- x->set_elt(d, ix);
- }
- return res;
- }
- realvec_t map(real_t f(real_t, real_t), realvec_t x) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, real_t, real_t),
- realvec_t x, realvec_t y) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
- return res;
+ void barrier() { volatile vector_t x __attribute__((__unused__)) = v; } // copies v into a volatile local that is never read; presumably keeps v from being optimised away -- TODO confirm
+
+ typedef boolbuiltinvec<real_t, size> boolvec_t;
+ typedef intbuiltinvec<real_t, size> intvec_t;
+ typedef realbuiltinvec realvec_t;
+
+private:
+ boolvec_t mapb(bool f(real_t)) const { // applies the scalar predicate f to each element of v
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]); // the result is written straight into the raw vector of res
+ return res;
+ }
+ intvec_t map(int_t f(real_t)) const { // applies a real -> int scalar function to each element
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t)) const { // applies a real -> real scalar function to each element
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, int_t), intvec_t x) const { // element d of the result is f(v[d], x.v[d])
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, int_t *), intvec_t *x) const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d) {
+ int_t ix;
+ res.v[d] = f(v[d], &ix);
+ x->set_elt(d, ix);
}
- public:
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realbuiltinvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
- // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
- // Can't have a constructor from vector_t, since this would
- // conflict with the constructor from real_t
- // realbuiltinvec(vector_t x): v(x) {}
- static realvec_t mkvec(vector_t x) { realvec_t res; res.v=x; return res; }
- realbuiltinvec(real_t a): v(a) {}
- realbuiltinvec(const real_t* as) { std::memcpy(&v, as, sizeof v); }
-
- real_t operator[](int n) const { return v[n]; }
- realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(const real_t* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, real_t), realvec_t x) const { // element d of the result is f(v[d], x.v[d])
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x, // element d of the result is f(v[d], x.v[d], y.v[d])
+ realvec_t y) const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d], y.v[d]);
+ return res;
+ }
+
+public:
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realbuiltinvec() {} // default constructor: gives v no initialiser
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realbuiltinvec(const realbuiltinvec& x): v(x.v) {}
+ // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; }
+ // Can't have a constructor from vector_t, since this would
+ // conflict with the constructor from real_t
+ // realbuiltinvec(vector_t x): v(x) {}
+ static realvec_t mkvec(vector_t x) { // factory standing in for the vector_t constructor ruled out above
+ realvec_t res;
+ res.v = x; // the raw vector is stored unchanged
+ return res;
+ }
+ realbuiltinvec(real_t a) : v(a) {} // initialises v from one scalar; presumably a splat to every lane -- TODO confirm
+ realbuiltinvec(const real_t *as) { std::memcpy(&v, as, sizeof v); } // copies sizeof v bytes starting at as into v
+
+ real_t operator[](int n) const { return v[n]; } // reads element n; no range check on n is done here
+ realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } // writes element n, then returns *this
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(const real_t *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
#if __has_builtin(__builtin_assume_aligned)
- p = (const real_t*)__builtin_assume_aligned(p, sizeof(realvec_t));
+ p = (const real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
#endif
- return mkvec(*(const vector_t*)p);
- }
- static realvec_t loadu(const real_t* p)
- {
- // return mkvec(*(const vector_t*)p);
- realvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, p[d]);
- return res;
- // realvec_t res;
- // memcpy(&res.v, p, sizeof res.v);
- // return res;
- }
- static realvec_t loadu(const real_t* p, size_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loadu(p+ioff);
- }
- realvec_t loada(const real_t* p, const mask_t& m) const
- {
- return m.m.ifthen(loada(p), *this);
- }
- realvec_t loadu(const real_t* p, const mask_t& m) const
- {
- return m.m.ifthen(loadu(p), *this);
- }
- realvec_t loadu(const real_t* p, size_t ioff, const mask_t& m) const
- {
- return m.m.ifthen(loadu(p, ioff), *this);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
+ return mkvec(*(const vector_t *)p);
+ }
+ static realvec_t loadu(const real_t *p) { // unaligned load: fills the result element by element from p[0] .. p[size-1]
+ // return mkvec(*(const vector_t*)p);
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, p[d]);
+ return res;
+ // realvec_t res;
+ // memcpy(&res.v, p, sizeof res.v);
+ // return res;
+ }
+ static realvec_t loadu(const real_t *p, size_t ioff) { // unaligned load from p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the assertion is on the base pointer p, not on p + ioff
+ return loadu(p + ioff);
+ }
+ realvec_t loada(const real_t *p, const mask_t &m) const { // masked load: m.m is the condition, the loaded vector the then-value, *this the else-value
+ return m.m.ifthen(loada(p), *this); // loada(p) reads the whole vector before the selection is applied
+ }
+ realvec_t loadu(const real_t *p, const mask_t &m) const { // as the masked loada, but using the unaligned load
+ return m.m.ifthen(loadu(p), *this);
+ }
+ realvec_t loadu(const real_t *p, size_t ioff, const mask_t &m) const { // as the masked loadu, reading from p + ioff
+ return m.m.ifthen(loadu(p, ioff), *this);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
#if __has_builtin(__builtin_assume_aligned)
- p = (real_t*)__builtin_assume_aligned(p, sizeof(realvec_t));
+ p = (real_t *)__builtin_assume_aligned(p, sizeof(realvec_t));
#endif
- *(vector_t*)p = v;
- }
- void storeu(real_t* p) const
- {
- // *(vector_t*)p = v;
- for (int d=0; d<size; ++d) p[d] = (*this)[d];
- // memcpy(p, &v, sizeof res.v);
- }
- void storeu(real_t* p, size_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff);
- }
- void storea(real_t* p, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p, m);
- }
- void storeu(real_t* p, const mask_t& m) const
- {
- for (int d=0; d<size; ++d) if (m.m[d]) p[d] = (*this)[d];
- }
- void storeu(real_t* p, size_t ioff, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const
- {
- intvec_t res;
- std::memcpy(&res.v, &v, sizeof res.v);
- return res;
- }
- intvec_t convert_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, int_t((*this)[d]));
- return res;
- }
-
-
-
- realvec_t operator+() const { return mkvec(+v); }
- realvec_t operator-() const { return mkvec(-v); }
-
- realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); }
- realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
- realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
- realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
-
- realvec_t& operator+=(const realvec_t& x) { return *this=*this+x; }
- realvec_t& operator-=(const realvec_t& x) { return *this=*this-x; }
- realvec_t& operator*=(const realvec_t& x) { return *this=*this*x; }
- realvec_t& operator/=(const realvec_t& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) {
- res = builtin_fmax(res, (*this)[d]);
- }
- return res;
- }
- real_t minval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) {
- res = builtin_fmin(res, (*this)[d]);
- }
- return res;
- }
- real_t prod() const
- {
- real_t res = (*this)[0];
- for (int d=1; d<size; ++d) res *= (*this)[d];
- return res;
- }
- real_t sum() const
- {
- real_t res = (*this)[0];
- for (int d=1; d<size; ++d) res += (*this)[d];
- return res;
- }
-
-
-
- boolvec_t operator==(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v == x.v);
- }
- boolvec_t operator!=(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v != x.v);
- }
- boolvec_t operator<(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v < x.v);
- }
- boolvec_t operator<=(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v <= x.v);
- }
- boolvec_t operator>(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v > x.v);
- }
- boolvec_t operator>=(const realvec_t& x) const
- {
- return boolvec_t::mkvec(v >= x.v);
- }
-
-
-
- realvec_t acos() const { return map(builtin_acos); }
- realvec_t acosh() const { return map(builtin_acosh); }
- realvec_t asin() const { return map(builtin_asin); }
- realvec_t asinh() const { return map(builtin_asinh); }
- realvec_t atan() const { return map(builtin_atan); }
- realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); }
- realvec_t atanh() const { return map(builtin_atanh); }
- realvec_t cbrt() const { return map(builtin_cbrt); }
- realvec_t ceil() const { return map(builtin_ceil); }
- realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
- realvec_t cos() const { return map(builtin_cos); }
- realvec_t cosh() const { return map(builtin_cosh); }
- realvec_t exp() const { return map(builtin_exp); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return map(builtin_exp2); }
- realvec_t expm1() const { return map(builtin_expm1); }
- realvec_t fabs() const { return map(builtin_fabs); }
- realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
- realvec_t floor() const { return map(builtin_floor); }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return map(builtin_fma, y, z);
- }
- realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
- realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
- realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
- realvec_t frexp(intvec_t* r) const
- {
- realvec_t res;
- intvec_t exp;
- for (int d=0; d<size; ++d) {
- real_t val = (*this)[d];
- int iexp;
- res.set_elt(d, __builtin_frexp(val, &iexp));
- int_t jexp = int_t(iexp);
- if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
- if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
- exp.set_elt(d, jexp);
- }
- *r = exp;
- return res;
- }
- realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); }
- intvec_t ilogb() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) {
- real_t val = (*this)[d];
- int iexp = __builtin_ilogb(val);
- int_t jexp = int_t(iexp);
- if (val == R(0.0)) jexp = std::numeric_limits<int_t>::min();
- if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max();
- if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min();
- res.set_elt(d, jexp);
- }
- return res;
- }
- boolvec_t isfinite() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
- }
- return res;
- }
- boolvec_t isinf() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_isinf((*this)[d]) != 0);
- }
- return res;
- }
- boolvec_t isnan() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_isnan((*this)[d]) != 0);
- }
- return res;
- }
- boolvec_t isnormal() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
- }
- return res;
- }
- realvec_t ldexp(int_t n) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
- }
- return res;
- }
- realvec_t ldexp(intvec_t n) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
- }
- return res;
+ *(vector_t *)p = v;
+ }
+ void storeu(real_t *p) const { // unaligned store: writes the elements one at a time to p[0] .. p[size-1]
+ // *(vector_t*)p = v;
+ for (int d = 0; d < size; ++d)
+ p[d] = (*this)[d];
+ // memcpy(p, &v, sizeof v);
+ }
+ void storeu(real_t *p, size_t ioff) const { // unaligned store to p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the assertion is on the base pointer p, not on p + ioff
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, const mask_t &m) const { // masked store to an aligned address
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storeu(p, m); // after the alignment check, the element-wise masked store does the work
+ }
+ void storeu(real_t *p, const mask_t &m) const { // masked store: only elements whose mask entry is true are written
+ for (int d = 0; d < size; ++d)
+ if (m.m[d])
+ p[d] = (*this)[d]; // p[d] is left untouched where m.m[d] is false
+ }
+ void storeu(real_t *p, size_t ioff, const mask_t &m) const { // masked store to p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the assertion is on the base pointer p, not on p + ioff
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { // copies the bytes of v into an intvec_t without converting values
+ intvec_t res;
+ std::memcpy(&res.v, &v, sizeof res.v); // the copy length is taken from the destination vector
+ return res;
+ }
+ intvec_t convert_int() const { // converts each element with an int_t(...) cast
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, int_t((*this)[d]));
+ return res;
+ }
+
+ realvec_t operator+() const { return mkvec(+v); } // unary plus applied to the raw vector
+ realvec_t operator-() const { return mkvec(-v); } // unary minus applied to the raw vector
+
+ realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); } // each binary operator applies the same operator to the two raw vectors
+ realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); }
+ realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); }
+ realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); }
+
+ realvec_t &operator+=(const realvec_t &x) { return *this = *this + x; } // compound forms assign the result of the matching binary operator
+ realvec_t &operator-=(const realvec_t &x) { return *this = *this - x; }
+ realvec_t &operator*=(const realvec_t &x) { return *this = *this * x; }
+ realvec_t &operator/=(const realvec_t &x) { return *this = *this / x; }
+
+ real_t maxval() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d) {
+ res = builtin_fmax(res, (*this)[d]);
}
- realvec_t log() const { return map(builtin_log); }
- realvec_t log10() const { return map(builtin_log10); }
- realvec_t log1p() const { return map(builtin_log1p); }
- realvec_t log2() const { return map(builtin_log2); }
- intvec_t lrint() const
- {
- if (sizeof(int_t) <= sizeof(long)) {
- return map(builtin_lrint);
- } else if (sizeof(int_t) <= sizeof(long long)) {
- return map(builtin_llrint);
- }
- __builtin_unreachable();
+ return res;
+ }
+ real_t minval() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d) {
+ res = builtin_fmin(res, (*this)[d]);
}
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
+ return res;
+ }
+ real_t prod() const { // product of all elements, accumulated in index order starting from element 0
+ real_t res = (*this)[0];
+ for (int d = 1; d < size; ++d)
+ res *= (*this)[d];
+ return res;
+ }
+ real_t sum() const { // sum of all elements, accumulated in index order starting from element 0
+ real_t res = (*this)[0];
+ for (int d = 1; d < size; ++d)
+ res += (*this)[d];
+ return res;
+ }
+
+ boolvec_t operator==(const realvec_t &x) const { // comparisons: the raw vector comparison result is wrapped by boolvec_t::mkvec
+ return boolvec_t::mkvec(v == x.v);
+ }
+ boolvec_t operator!=(const realvec_t &x) const {
+ return boolvec_t::mkvec(v != x.v);
+ }
+ boolvec_t operator<(const realvec_t &x) const {
+ return boolvec_t::mkvec(v < x.v);
+ }
+ boolvec_t operator<=(const realvec_t &x) const {
+ return boolvec_t::mkvec(v <= x.v);
+ }
+ boolvec_t operator>(const realvec_t &x) const {
+ return boolvec_t::mkvec(v > x.v);
+ }
+ boolvec_t operator>=(const realvec_t &x) const {
+ return boolvec_t::mkvec(v >= x.v);
+ }
+
+ realvec_t acos() const { return map(builtin_acos); } // the wrappers here hand a builtin_* scalar function to map(), which applies it per element
+ realvec_t acosh() const { return map(builtin_acosh); }
+ realvec_t asin() const { return map(builtin_asin); }
+ realvec_t asinh() const { return map(builtin_asinh); }
+ realvec_t atan() const { return map(builtin_atan); }
+ realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); } // *this supplies the first argument, y the second
+ realvec_t atanh() const { return map(builtin_atanh); }
+ realvec_t cbrt() const { return map(builtin_cbrt); }
+ realvec_t ceil() const { return map(builtin_ceil); }
+ realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); }
+ realvec_t cos() const { return map(builtin_cos); }
+ realvec_t cosh() const { return map(builtin_cosh); }
+ realvec_t exp() const { return map(builtin_exp); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); } // unlike its neighbours, forwards to the generic implementation instead of map()
+ realvec_t exp2() const { return map(builtin_exp2); }
+ realvec_t expm1() const { return map(builtin_expm1); }
+ realvec_t fabs() const { return map(builtin_fabs); }
+ realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); }
+ realvec_t floor() const { return map(builtin_floor); }
+ realvec_t fma(realvec_t y, realvec_t z) const { // *this, y and z are passed to builtin_fma in that order
+ return map(builtin_fma, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); }
+ realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); }
+ realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); }
+ realvec_t frexp(intvec_t *r) const { // returns the per-element frexp result and stores the exponents through r
+ realvec_t res;
+ intvec_t exp;
+ for (int d = 0; d < size; ++d) {
+ real_t val = (*this)[d];
+ int iexp;
+ res.set_elt(d, __builtin_frexp(val, &iexp)); // NOTE(review): calls __builtin_frexp directly, not a builtin_* function like the other wrappers -- confirm for non-double real_t
+ int_t jexp = int_t(iexp);
+ if (__builtin_isinf(val))
+ jexp = std::numeric_limits<int_t>::max(); // infinite input: exponent replaced by the largest int_t
+ if (__builtin_isnan(val))
+ jexp = std::numeric_limits<int_t>::min(); // NaN input: exponent replaced by the smallest int_t
+ exp.set_elt(d, jexp);
+ }
+ *r = exp; // r is written once, after all elements have been processed
+ return res;
+ }
+ realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); } // per-element builtin_hypot via map()
+ intvec_t ilogb() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d) {
+ real_t val = (*this)[d];
+ int iexp = __builtin_ilogb(val);
+ int_t jexp = int_t(iexp);
+ if (val == R(0.0))
+ jexp = std::numeric_limits<int_t>::min();
+ if (__builtin_isinf(val))
+ jexp = std::numeric_limits<int_t>::max();
+ if (__builtin_isnan(val))
+ jexp = std::numeric_limits<int_t>::min();
+ res.set_elt(d, jexp);
}
- realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
- realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
- realvec_t rcp() const { return RV(1.0) / *this; }
- realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
- realvec_t rint() const { return map(builtin_rint); }
- realvec_t round() const { return map(builtin_round); }
- realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
- boolvec_t signbit() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) {
- res.set_elt(d, builtin_signbit((*this)[d]) != 0);
- }
- return res;
+ return res;
+ }
+ boolvec_t isfinite() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_isfinite((*this)[d]) != 0);
}
- realvec_t sin() const { return map(builtin_sin); }
- realvec_t sinh() const { return map(builtin_sinh); }
- realvec_t sqrt() const { return map(builtin_sqrt); }
- realvec_t tan() const { return map(builtin_tan); }
- realvec_t tanh() const { return map(builtin_tanh); }
- realvec_t trunc() const { return map(builtin_trunc); }
- };
-
-
-
- // boolbuiltinvec definitions
-
- template<typename T, int N>
- inline
- typename boolbuiltinvec<T,N>::intvec_t boolbuiltinvec<T,N>::as_int() const
- {
- intvec_t res;
- std::memcpy(&res.v, &v, sizeof res.v);
return res;
}
-
- template<typename T, int N>
- inline
- typename boolbuiltinvec<T,N>::intvec_t
- boolbuiltinvec<T,N>::convert_int() const
- {
- return - as_int();
- }
-
- template<typename T, int N>
- inline
- typename boolbuiltinvec<T,N>::boolvec_t
- boolbuiltinvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
- {
- // return v ? x.v : y.v;
+ boolvec_t isinf() const {
boolvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_isinf((*this)[d]) != 0);
+ }
return res;
}
-
- template<typename T, int N>
- inline
- typename boolbuiltinvec<T,N>::intvec_t
- boolbuiltinvec<T,N>::ifthen(intvec_t x, intvec_t y) const
- {
- // return v ? x.v : y.v;
- intvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ boolvec_t isnan() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_isnan((*this)[d]) != 0);
+ }
return res;
}
-
- template<typename T, int N>
- inline
- typename boolbuiltinvec<T,N>::realvec_t
- boolbuiltinvec<T,N>::ifthen(realvec_t x, realvec_t y) const
- {
- // return v ? x.v : y.v;
- realvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ boolvec_t isnormal() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_isnormal((*this)[d]) != 0);
+ }
return res;
}
-
-
-
- // intbuiltinvec definitions
-
- template<typename T, int N>
- inline
- typename intbuiltinvec<T,N>::realvec_t intbuiltinvec<T,N>::as_float() const
- {
+ realvec_t ldexp(int_t n) const {
realvec_t res;
- std::memcpy(&res.v, &v, sizeof res.v);
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_ldexp((*this)[d], int(n)));
+ }
return res;
}
-
- template<typename T, int N>
- inline
- typename intbuiltinvec<T,N>::realvec_t
- intbuiltinvec<T,N>::convert_float() const
- {
+ realvec_t ldexp(intvec_t n) const {
realvec_t res;
- for (int d=0; d<size; ++d) res.set_elt(d, real_t((*this)[d]));
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_ldexp((*this)[d], int(n[d])));
+ }
return res;
}
-
-
-
- // Wrappers
-
- // boolbuiltinvec wrappers
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline bool all(boolbuiltinvec<real_t, size> x) { return x.all(); }
-
- template<typename real_t, int size>
- inline bool any(boolbuiltinvec<real_t, size> x) { return x.any(); }
-
- template<typename real_t, int size>
- inline
- boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
- boolbuiltinvec<real_t, size> x,
- boolbuiltinvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
- intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
- realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
-
-
- // intbuiltinvec wrappers
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x)
- {
- return x.abs();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x)
- {
- return x.as_bool();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x)
- {
- return x.as_float();
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> y,
- intbuiltinvec<real_t, size> z)
- {
- return x.bitifthen(y, z);
- }
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x)
- {
- return x.clz();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> convert_bool(intbuiltinvec<real_t, size> x)
- {
- return x.convert_bool();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> convert_float(intbuiltinvec<real_t, size> x)
- {
- return x.convert_float();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x)
- {
- return x.isignbit();
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
- typename intbuiltinvec<real_t, size>::int_t n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> y)
- {
- return x.max(y);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> y)
- {
- return x.min(y);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x)
- {
- return x.popcount();
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size>
- rotate(intbuiltinvec<real_t, size> x,
- typename intbuiltinvec<real_t, size>::int_t n)
- {
- return x.rotate(n);
- }
-
- template<typename real_t, int size>
- inline
- intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> n)
- {
- return x.rotate(n);
- }
-
-
-
- // realbuiltinvec wrappers
-
- template<typename real_t, int size>
- inline
- realbuiltinvec<real_t, size>
- loada(real_t const* p,
- realbuiltinvec<real_t, size> x,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.loada(p, m);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size>
- loadu(real_t const* p,
- realbuiltinvec<real_t, size> x,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, m);
- }
-
- template<typename real_t, int size>
- inline
- realbuiltinvec<real_t, size>
- loadu(real_t const* p, size_t ioff,
- realbuiltinvec<real_t, size> x,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, ioff, m);
- }
-
- template<typename real_t, int size>
- inline void storea(realbuiltinvec<real_t, size> x, real_t* p)
- {
- return x.storea(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realbuiltinvec<real_t, size> x, real_t* p)
- {
- return x.storeu(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff)
- {
- return x.storeu(p, ioff);
- }
-
- template<typename real_t, int size>
- inline void storea(realbuiltinvec<real_t, size> x, real_t* p,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.storea(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realbuiltinvec<real_t, size> x, real_t* p,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff,
- typename realbuiltinvec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, ioff, m);
- }
-
-
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline real_t maxval(realbuiltinvec<real_t, size> x)
- {
- return x.maxval();
- }
-
- template<typename real_t, int size>
- inline real_t minval(realbuiltinvec<real_t, size> x)
- {
- return x.minval();
- }
-
- template<typename real_t, int size>
- inline real_t prod(realbuiltinvec<real_t, size> x)
- {
- return x.prod();
- }
-
- template<typename real_t, int size>
- inline real_t sum(realbuiltinvec<real_t, size> x)
- {
- return x.sum();
- }
-
-
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x)
- {
- return x.acos();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x)
- {
- return x.acosh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x)
- {
- return x.asin();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x)
- {
- return x.asinh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x)
- {
- return x.atan();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.atan2(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x)
- {
- return x.atanh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x)
- {
- return x.cbrt();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x)
- {
- return x.ceil();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.copysign(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x)
- {
- return x.cos();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x)
- {
- return x.cosh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x)
- {
- return x.exp();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x)
- {
- return x.exp10();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x)
- {
- return x.exp2();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x)
- {
- return x.expm1();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x)
- {
- return x.fabs();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x)
- {
- return x.floor();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.fdim(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y,
- realbuiltinvec<real_t, size> z)
- {
- return x.fma(y, z);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.fmax(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.fmin(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.fmod(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size>* r)
- {
- return x.frexp(r);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.hypot(y);
- }
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x)
- {
- return x.ilogb();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x)
- {
- return x.isfinite();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x)
- {
- return x.isinf();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x)
- {
- return x.isnan();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x)
- {
- return x.isnormal();
- }
-
- template<typename real_t, int size>
- inline
- realbuiltinvec<real_t, size>
- ldexp(realbuiltinvec<real_t, size> x,
- typename intbuiltinvec<real_t, size>::int_t n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline
- realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
- intbuiltinvec<real_t, size> n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x)
- {
- return x.log();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x)
- {
- return x.log10();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x)
- {
- return x.log1p();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x)
- {
- return x.log2();
- }
-
- template<typename real_t, int size>
- inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x)
- {
- return x.lrint();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y,
- realbuiltinvec<real_t, size> z)
- {
- return x.mad(y, z);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.nextafter(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.pow(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x)
- {
- return x.rcp();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
- realbuiltinvec<real_t, size> y)
- {
- return x.remainder(y);
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x)
- {
- return x.rint();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x)
- {
- return x.round();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x)
- {
- return x.rsqrt();
- }
-
- template<typename real_t, int size>
- inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x)
- {
- return x.signbit();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x)
- {
- return x.sin();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x)
- {
- return x.sinh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x)
- {
- return x.sqrt();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x)
- {
- return x.tan();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x)
- {
- return x.tanh();
- }
-
- template<typename real_t, int size>
- inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x)
- {
- return x.trunc();
- }
-
-
-
-#ifndef VML_NO_IOSTREAM
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- boolbuiltinvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- intbuiltinvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- realbuiltinvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
+ realvec_t log() const { return map(builtin_log); }
+ realvec_t log10() const { return map(builtin_log10); }
+ realvec_t log1p() const { return map(builtin_log1p); }
+ realvec_t log2() const { return map(builtin_log2); }
+ intvec_t lrint() const {
+ if (sizeof(int_t) <= sizeof(long)) {
+ return map(builtin_lrint);
+ } else if (sizeof(int_t) <= sizeof(long long)) {
+ return map(builtin_llrint);
+ }
+ __builtin_unreachable();
+ }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); }
+ realvec_t pow(realvec_t y) const { return map(builtin_pow, y); }
+ realvec_t rcp() const { return RV(1.0) / *this; }
+ realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); }
+ realvec_t rint() const { return map(builtin_rint); }
+ realvec_t round() const { return map(builtin_round); }
+ realvec_t rsqrt() const { return RV(1.0) / sqrt(); }
+ boolvec_t signbit() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d) {
+ res.set_elt(d, builtin_signbit((*this)[d]) != 0);
}
- os << "]";
- return os;
+ return res;
}
+ realvec_t sin() const { return map(builtin_sin); }
+ realvec_t sinh() const { return map(builtin_sinh); }
+ realvec_t sqrt() const { return map(builtin_sqrt); }
+ realvec_t tan() const { return map(builtin_tan); }
+ realvec_t tanh() const { return map(builtin_tanh); }
+ realvec_t trunc() const { return map(builtin_trunc); }
+};
+
+// boolbuiltinvec definitions
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::as_int() const {
+ intvec_t res;
+ std::memcpy(&res.v, &v, sizeof res.v);
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::convert_int() const {
+ return -as_int();
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::boolvec_t
+boolbuiltinvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+ // return v ? x.v : y.v;
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::intvec_t
+boolbuiltinvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+ // return v ? x.v : y.v;
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolbuiltinvec<T, N>::realvec_t
+boolbuiltinvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+ // return v ? x.v : y.v;
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, (*this)[d] ? x[d] : y[d]);
+ return res;
+}
+
+// intbuiltinvec definitions
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::as_float() const {
+ realvec_t res;
+ std::memcpy(&res.v, &v, sizeof res.v);
+ return res;
+}
+
+template <typename T, int N>
+inline typename intbuiltinvec<T, N>::realvec_t
+intbuiltinvec<T, N>::convert_float() const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.set_elt(d, real_t((*this)[d]));
+ return res;
+}
+
+// Wrappers
+
+// boolbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(boolbuiltinvec<real_t, size> x) {
+ return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(boolbuiltinvec<real_t, size> x) {
+ return x.any();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+ boolbuiltinvec<real_t, size> x,
+ boolbuiltinvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+ intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c,
+ realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+// intbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x) {
+ return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x) {
+ return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x) {
+ return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> y,
+ intbuiltinvec<real_t, size> z) {
+ return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x) {
+ return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size>
+convert_bool(intbuiltinvec<real_t, size> x) {
+ return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+convert_float(intbuiltinvec<real_t, size> x) {
+ return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x) {
+ return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+lsr(intbuiltinvec<real_t, size> x,
+ typename intbuiltinvec<real_t, size>::int_t n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> y) {
+ return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> y) {
+ return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x) {
+ return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size>
+rotate(intbuiltinvec<real_t, size> x,
+ typename intbuiltinvec<real_t, size>::int_t n) {
+ return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> n) {
+ return x.rotate(n);
+}
+
+// realbuiltinvec wrappers
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loada(real_t const *p, realbuiltinvec<real_t, size> x,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, realbuiltinvec<real_t, size> x,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realbuiltinvec<real_t, size> x,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p) {
+ return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p) {
+ return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff) {
+ return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realbuiltinvec<real_t, size> x, real_t *p,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff,
+ typename realbuiltinvec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realbuiltinvec<real_t, size> x) {
+ return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realbuiltinvec<real_t, size> x) {
+ return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realbuiltinvec<real_t, size> x) {
+ return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realbuiltinvec<real_t, size> x) {
+ return x.sum();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x) {
+ return x.acos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x) {
+ return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x) {
+ return x.asin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x) {
+ return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x) {
+ return x.atan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x) {
+ return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x) {
+ return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x) {
+ return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x) {
+ return x.cos();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x) {
+ return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x) {
+ return x.exp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x) {
+ return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x) {
+ return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x) {
+ return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x) {
+ return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x) {
+ return x.floor();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y,
+ realbuiltinvec<real_t, size> z) {
+ return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> *r) {
+ return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x) {
+ return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x) {
+ return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x) {
+ return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x) {
+ return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x) {
+ return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size>
+ldexp(realbuiltinvec<real_t, size> x,
+ typename intbuiltinvec<real_t, size>::int_t n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x,
+ intbuiltinvec<real_t, size> n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x) {
+ return x.log();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x) {
+ return x.log10();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x) {
+ return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x) {
+ return x.log2();
+}
+
+template <typename real_t, int size>
+inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x) {
+ return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y,
+ realbuiltinvec<real_t, size> z) {
+ return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x) {
+ return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x,
+ realbuiltinvec<real_t, size> y) {
+ return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x) {
+ return x.rint();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x) {
+ return x.round();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x) {
+ return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x) {
+ return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x) {
+ return x.sin();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x) {
+ return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x) {
+ return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x) {
+ return x.tan();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x) {
+ return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x) {
+ return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ boolbuiltinvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ intbuiltinvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ realbuiltinvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
#endif
-
+
} // namespace vecmathlib
-#endif // #ifndef VEC_BUILTIN_H
+#endif // #ifndef VEC_BUILTIN_H
diff --git a/vec_mask.h b/vec_mask.h
index 6f8c996..053e43a 100644
--- a/vec_mask.h
+++ b/vec_mask.h
@@ -5,74 +5,67 @@
#include <cstdlib>
+namespace vecmathlib {
+template <typename realvec_t> class mask_t {
-namespace vecmathlib {
-
- template<typename realvec_t>
- class mask_t {
-
- typedef typename realvec_t::boolvec_t boolvec_t;
- typedef typename realvec_t::intvec_t intvec_t;
- static const int size = realvec_t::size;
-
- public:
- std::ptrdiff_t imin, imax;
- std::ptrdiff_t i;
- boolvec_t m;
- bool all_m;
-
- public:
-
- // Construct a mask from a boolvec
- mask_t(boolvec_t m_): m(m_), all_m(all(m)) {}
-
- // Construct a mask for a particular location i
- mask_t(std::ptrdiff_t i_,
- std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
- imin(imin_), imax(imax_), i(i_)
- {
- all_m = i-imin >= 0 && i+size-1-imax < 0;
- if (__builtin_expect(all_m, true)) {
- m = true;
- } else {
- m = (! isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
- isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
- }
+ typedef typename realvec_t::boolvec_t boolvec_t;
+ typedef typename realvec_t::intvec_t intvec_t;
+ static const int size = realvec_t::size;
+
+public:
+ std::ptrdiff_t imin, imax;
+ std::ptrdiff_t i;
+ boolvec_t m;
+ bool all_m;
+
+public:
+ // Construct a mask from a boolvec
+ mask_t(boolvec_t m_) : m(m_), all_m(all(m)) {}
+
+ // Construct a mask for a particular location i
+ mask_t(std::ptrdiff_t i_, std::ptrdiff_t imin_, std::ptrdiff_t imax_,
+ std::ptrdiff_t ioff)
+ : imin(imin_), imax(imax_), i(i_) {
+ all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+ if (__builtin_expect(all_m, true)) {
+ m = true;
+ } else {
+ m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+ isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
}
-
- // Construct a mask for a loop starting at imin, aligned down
- mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff):
- imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size)
- {
- all_m = i-imin >= 0 && i+size-1-imax < 0;
- if (__builtin_expect(all_m, true)) {
- m = true;
- } else {
- m = (! isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
- isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()));
- }
+ }
+
+ // Construct a mask for a loop starting at imin, aligned down
+ mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff)
+ : imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size) {
+ all_m = i - imin >= 0 && i + size - 1 - imax < 0;
+ if (__builtin_expect(all_m, true)) {
+ m = true;
+ } else {
+ m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) &&
+ isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()));
}
-
- // Get current index
- std::ptrdiff_t index() const { return i; }
-
- // Looping condition
- operator bool() const { return i<imax; }
-
- // Loop stepper
- void operator++()
- {
- i += size;
- all_m = i + size-1 - imax < 0;
- if (__builtin_expect(all_m, true)) {
- m = true;
- } else {
- m = isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota());
- }
+ }
+
+ // Get current index
+ std::ptrdiff_t index() const { return i; }
+
+ // Looping condition
+ operator bool() const { return i < imax; }
+
+ // Loop stepper
+ void operator++() {
+ i += size;
+ all_m = i + size - 1 - imax < 0;
+ if (__builtin_expect(all_m, true)) {
+ m = true;
+ } else {
+ m = isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota());
}
- };
-
+ }
+};
+
} // namespace vecmathlib
-#endif // #ifndef VEC_MASK_H
+#endif // #ifndef VEC_MASK_H
diff --git a/vec_mic_double8.h b/vec_mic_double8.h
index 68dd5aa..ef22088 100644
--- a/vec_mic_double8.h
+++ b/vec_mic_double8.h
@@ -12,697 +12,585 @@
// MIC intrinsics
#include <immintrin.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_8
- template<> struct boolvec<double,8>;
- template<> struct intvec<double,8>;
- template<> struct realvec<double,8>;
-
-
-
- template<>
- struct boolvec<double,8>: floatprops<double>
- {
- static const int size = 8;
- typedef bool scalar_t;
- typedef __mask8 bvector_t;
- static const int alignment = sizeof(bvector_t);
-
- // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- // "vector size is wrong");
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(const boolvec& x): v(x.v) {}
- // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(- bvector_t(a)) {}
- boolvec(const bool* as):
- v((bvector_t(as[0]) << 0) |
- (bvector_t(as[1]) << 1) |
- (bvector_t(as[2]) << 2) |
- (bvector_t(as[3]) << 3) |
- (bvector_t(as[4]) << 4) |
- (bvector_t(as[5]) << 5) |
- (bvector_t(as[6]) << 6) |
- (bvector_t(as[7]) << 7))
- {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return (v >> n) & 1;
- }
- boolvec& set_elt(int n, bool a)
- {
- v &= ~ (bvector_t(1) << n);
- v |= bvector_t(a) << n;
- return *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return _mm512_knot(v); }
-
- boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
- boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
- boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
- boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
-
- bool all() const { return _mm512_kortestc(v, v); }
- bool any() const { return ! bool(_mm512_kortestz(v, v)); }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,8>: floatprops<double>
- {
- static const int size = 8;
- typedef int_t scalar_t;
- typedef __m512i ivector_t;
- static const int alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(const intvec& x): v(x.v) {}
- // intvec& operator=(const intvec& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm512_set1_epi64(a)) {}
- intvec(const int_t* as)
- {
- v = _mm512_undefined_epi32();
- // v = _mm512_loadunpacklo_epi32(v, as);
- // v = _mm512_loadunpackhi_epi32(v, as+8);
- for (int n=0; n<size; ++n) set_elt(n, as[n]);
- }
- static intvec iota()
- {
- intvec r;
- for (int n=0; n<size; ++n) r.set_elt(n, n);
- return r;
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- private:
- static __mmask8 mask16tomask8(__mmask16 m16)
- {
- // combine 01
- m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
- // combine 0123
- m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
- // combine 01234567
- m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
- return m16;
- }
- public:
- boolvec_t as_bool() const { return convert_bool(); }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- __mmask16 r16 = _mm512_test_epi32_mask(v, v);
- return mask16tomask8(r16);
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return IV(I(0)) - *this; }
- intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
- intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
-
- intvec& operator+=(const intvec& x) { return *this=*this+x; }
- intvec& operator-=(const intvec& x) { return *this=*this-x; }
-
-
-
- intvec operator~() const { return IV(~U(0)) ^ *this; }
- intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
- intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
- intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
-
- intvec& operator&=(const intvec& x) { return *this=*this&x; }
- intvec& operator|=(const intvec& x) { return *this=*this|x; }
- intvec& operator^=(const intvec& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec lsr(int_t n) const
- {
- if (n < 32) {
- __m512i vlo = _mm512_srli_epi32(v, n);
- __m512i vhi = _mm512_slli_epi32(v, 32-n);
- vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
- return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo);
- } else {
- __m512i vlo = _mm512_srli_epi32(v, n-32);
- __m512i vhi = _mm512_setzero_epi32();
- return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
- }
- }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const
- {
- if (n < 32) {
- __mm512i vlo = _mm512_srai_epi32(v, n);
- __mm512i vlo0 = _mm512_srli_epi32(v, n);
- __mm512i vhi = _mm512_slli_epi32(v, 32-n);
- vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
- return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0);
- } else {
- __m512i vlo = _mm512_srai_epi32(v, n-32);
- __m512i vhi = _mm512_srai_epi32(v, 31);
- return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
- }
- }
- intvec operator<<(int_t n) const
- {
- if (n < 32) {
- __m512i vlo = _mm512_srli_epi32(v, n);
- __m512i vhi = _mm512_slli_epi32(v, 32-n);
- vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB);
- return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo);
- } else {
- __m512i vlo = _mm512_setzero_epi32();
- __m512i vhi = _mm512_slli_epi32(v, n-32);
- return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo);
- }
- }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec lsr(intvec n) const
- {
- // TODO: improve this
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- // TODO: improve this
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
- }
- intvec operator<<(intvec n) const
- {
- // TODO: improve this
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const
- {
- // Return 8*sizeof(TYPE) when the input is 0
- intvec_t r;
- for (int i=0; i<size; ++i) {
- // __lzcnt64
- r.set_elt(i, __builtin_clzll((*this)[i]));
- }
- return r;
- }
- intvec_t popcount() const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- // _mm_popcnt_u64
- r.set_elt(i, __builtin_popcountll((*this)[i]));
- }
- return r;
- }
-
-
-
- boolvec_t operator==(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
- }
- boolvec_t operator!=(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
- }
- boolvec_t operator<(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
- }
- boolvec_t operator<=(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
- }
- boolvec_t operator>(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
- }
- boolvec_t operator>=(const intvec& x) const
- {
- return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const;
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<double,8>: floatprops<double>
- {
- static const int size = 8;
- typedef real_t scalar_t;
- typedef __m512d vector_t;
- static const int alignment = sizeof(vector_t);
-
- static const char* name() { return "<MIC:8*double>"; }
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(const realvec& x): v(x.v) {}
- // realvec& operator=(const realvec& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm512_set1_pd(a)) {}
- realvec(const real_t* as)
- {
- v = _mm512_undefined_pd();
- // v = _mm512_loadunpacklo_pd(v, as);
- // v = _mm512_loadunpackhi_pd(v, as+8);
- for (int n=0; n<size; ++n) set_elt(n, as[n]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(const real_t* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm512_load_pd(p);
- }
- static realvec_t loadu(const real_t* p)
- {
- realvec_t r(_mm512_undefined_pd());
- r.v = _mm512_loadunpacklo_pd(r.v, p);
- r.v = _mm512_loadunpackhi_pd(r.v, p+8);
- return r.v;
- }
- static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(const real_t* p, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm512_mask_load_pd(v, m.m.v, p);
+template <> struct boolvec<double, 8>;
+template <> struct intvec<double, 8>;
+template <> struct realvec<double, 8>;
+
+template <> struct boolvec<double, 8> : floatprops<double> {
+ static const int size = 8;
+ typedef bool scalar_t;
+ typedef __mask8 bvector_t;
+ static const int alignment = sizeof(bvector_t);
+
+ // static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ // "vector size is wrong");
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(const boolvec& x): v(x.v) {}
+ // boolvec& operator=(const boolvec& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(-bvector_t(a)) {}
+ boolvec(const bool *as)
+ : v((bvector_t(as[0]) << 0) | (bvector_t(as[1]) << 1) |
+ (bvector_t(as[2]) << 2) | (bvector_t(as[3]) << 3) |
+ (bvector_t(as[4]) << 4) | (bvector_t(as[5]) << 5) |
+ (bvector_t(as[6]) << 6) | (bvector_t(as[7]) << 7)) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return (v >> n) & 1; }
+ boolvec &set_elt(int n, bool a) {
+ v &= ~(bvector_t(1) << n);
+ v |= bvector_t(a) << n;
+ return *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return _mm512_knot(v); }
+
+ boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); }
+ boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); }
+ boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); }
+ boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); }
+
+ bool all() const { return _mm512_kortestc(v, v); }
+ bool any() const { return !bool(_mm512_kortestz(v, v)); }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 8> : floatprops<double> {
+ static const int size = 8;
+ typedef int_t scalar_t;
+ typedef __m512i ivector_t;
+ static const int alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(const intvec& x): v(x.v) {}
+ // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm512_set1_epi64(a)) {}
+ intvec(const int_t *as) {
+ v = _mm512_undefined_epi32();
+ // v = _mm512_loadunpacklo_epi32(v, as);
+ // v = _mm512_loadunpackhi_epi32(v, as+8);
+ for (int n = 0; n < size; ++n)
+ set_elt(n, as[n]);
+ }
+ static intvec iota() {
+ intvec r;
+ for (int n = 0; n < size; ++n)
+ r.set_elt(n, n);
+ return r;
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+private:
+ static __mmask8 mask16tomask8(__mmask16 m16) {
+ // combine 01
+ m16 = ((m16 >> 1) | m16) & 0b0011001100110011;
+ // combine 0123
+ m16 = ((m16 >> 2) | m16) & 0b0000111100001111;
+ // combine 01234567
+ m16 = ((m16 >> 4) | m16) & 0b0000000011111111;
+ return m16;
+ }
+
+public:
+ boolvec_t as_bool() const { return convert_bool(); }
+ boolvec_t convert_bool() const {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ __mmask16 r16 = _mm512_test_epi32_mask(v, v);
+ return mask16tomask8(r16);
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return IV(I(0)) - *this; }
+ intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); }
+ intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); }
+
+ intvec &operator+=(const intvec &x) { return *this = *this + x; }
+ intvec &operator-=(const intvec &x) { return *this = *this - x; }
+
+ intvec operator~() const { return IV(~U(0)) ^ *this; }
+ intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); }
+ intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); }
+ intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); }
+
+ intvec &operator&=(const intvec &x) { return *this = *this & x; }
+ intvec &operator|=(const intvec &x) { return *this = *this | x; }
+ intvec &operator^=(const intvec &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec lsr(int_t n) const {
+ if (n < 32) {
+ __m512i vlo = _mm512_srli_epi32(v, n);
+ __m512i vhi = _mm512_slli_epi32(v, 32 - n);
+ vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
+ return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo);
+ } else {
+ __m512i vlo = _mm512_srli_epi32(v, n - 32);
+ __m512i vhi = _mm512_setzero_epi32();
+ return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
}
- realvec_t loadu(const real_t* p, const mask_t& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
+ }
+ intvec_t rotate(int_t n) const;
+ intvec operator>>(int_t n) const {
+ if (n < 32) {
+ __mm512i vlo = _mm512_srai_epi32(v, n);
+ __mm512i vlo0 = _mm512_srli_epi32(v, n);
+ __mm512i vhi = _mm512_slli_epi32(v, 32 - n);
+ vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB);
+ return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0);
+ } else {
+ __m512i vlo = _mm512_srai_epi32(v, n - 32);
+ __m512i vhi = _mm512_srai_epi32(v, 31);
+ return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo);
}
- realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
+ }
+ intvec operator<<(int_t n) const {
+ if (n < 32) {
+ __m512i vlo = _mm512_srli_epi32(v, n);
+ __m512i vhi = _mm512_slli_epi32(v, 32 - n);
+ vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB);
+ return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo);
+ } else {
+ __m512i vlo = _mm512_setzero_epi32();
+ __m512i vhi = _mm512_slli_epi32(v, n - 32);
+ return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo);
}
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm512_store_pd(p, v);
+ }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec lsr(intvec n) const {
+ // TODO: improve this
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- void storeu(real_t* p) const
- {
- _mm512_packstorelo_pd(p, v);
- _mm512_packstorehi_pd(p+8, v);
+ return r;
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec operator>>(intvec n) const {
+ // TODO: improve this
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
+ return r;
+ }
+ intvec operator<<(intvec n) const {
+ // TODO: improve this
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- void storea(real_t* p, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm512_mask_store_pd(p, m.m.v, v);
+ return r;
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const {
+ // Return 8*sizeof(TYPE) when the input is 0
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ // __lzcnt64
+ r.set_elt(i, __builtin_clzll((*this)[i]));
}
- void storeu(real_t* p, const mask_t& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- for (int n=0; n<size; ++n) {
- if (m.m[n]) p[n] = (*this)[n];
- }
- }
+ return r;
+ }
+ intvec_t popcount() const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ // _mm_popcnt_u64
+ r.set_elt(i, __builtin_popcountll((*this)[i]));
}
- void storeu(real_t* p, std::ptrdiff_t ioff, const mask_t& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
+ return r;
+ }
+
+ boolvec_t operator==(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ));
+ }
+ boolvec_t operator!=(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE));
+ }
+ boolvec_t operator<(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT));
+ }
+ boolvec_t operator<=(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE));
+ }
+ boolvec_t operator>(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT));
+ }
+ boolvec_t operator>=(const intvec &x) const {
+ return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE));
+ }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const;
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 8> : floatprops<double> {
+ static const int size = 8;
+ typedef real_t scalar_t;
+ typedef __m512d vector_t;
+ static const int alignment = sizeof(vector_t);
+
+ static const char *name() { return "<MIC:8*double>"; }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(const realvec& x): v(x.v) {}
+ // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm512_set1_pd(a)) {}
+ realvec(const real_t *as) {
+ v = _mm512_undefined_pd();
+ // v = _mm512_loadunpacklo_pd(v, as);
+ // v = _mm512_loadunpackhi_pd(v, as+8);
+ for (int n = 0; n < size; ++n)
+ set_elt(n, as[n]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(const real_t *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm512_load_pd(p);
+ }
+ static realvec_t loadu(const real_t *p) {
+ realvec_t r(_mm512_undefined_pd());
+ r.v = _mm512_loadunpacklo_pd(r.v, p);
+ r.v = _mm512_loadunpackhi_pd(r.v, p + 8);
+ return r.v;
+ }
+ static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(const real_t *p, const mask_t &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm512_mask_load_pd(v, m.m.v, p);
+ }
+ realvec_t loadu(const real_t *p, const mask_t &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
-
-
-
- intvec_t as_int() const { return _mm512_castpd_si512(v); }
- intvec_t convert_int() const
- {
- intvec_t r(_mm512_undefined_epi32());
- for (int n=0; n<size; ++n) {
- r.set_elt(n, floatprops::convert_int((*this)[n]));
+ }
+ realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm512_store_pd(p, v);
+ }
+ void storeu(real_t *p) const {
+ _mm512_packstorelo_pd(p, v);
+ _mm512_packstorehi_pd(p + 8, v);
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, const mask_t &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm512_mask_store_pd(p, m.m.v, v);
+ }
+ void storeu(real_t *p, const mask_t &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ for (int n = 0; n < size; ++n) {
+ if (m.m[n])
+ p[n] = (*this)[n];
}
- return r;
- }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const { return RV(0.0) - *this; }
-
- realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
- realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
- realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
- realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
-
- realvec& operator+=(const realvec& x) { return *this=*this+x; }
- realvec& operator-=(const realvec& x) { return *this=*this-x; }
- realvec& operator*=(const realvec& x) { return *this=*this*x; }
- realvec& operator/=(const realvec& x) { return *this=*this/x; }
-
- real_t maxval() const { returm _mm512_reduce_gmax_pd(v); }
- real_t minval() const { returm _mm512_reduce_gmin_pd(v); }
- real_t prod() const { returm _mm512_reduce_mul_pd(v); }
- real_t sum() const { returm _mm512_reduce_add_pd(v); }
-
-
-
- boolvec_t operator==(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
- }
- boolvec_t operator!=(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
- }
- boolvec_t operator<(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
- }
- boolvec_t operator<=(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
- }
- boolvec_t operator>(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
}
- boolvec_t operator>=(const realvec& x) const
- {
- return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
- }
-
-
-
- realvec acos() const { return MF::vml_acos(*this); }
- realvec acosh() const { return MF::vml_acosh(*this); }
- realvec asin() const { return MF::vml_asin(*this); }
- realvec asinh() const { return MF::vml_asinh(*this); }
- realvec atan() const { return MF::vml_atan(*this); }
- realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
- realvec atanh() const { return MF::vml_atanh(*this); }
- realvec cbrt() const { return MF::vml_cbrt(*this); }
- realvec ceil() const { return _mm512_ceil_pd(v); }
- realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
- realvec cos() const { return MF::vml_cos(*this); }
- realvec cosh() const { return MF::vml_cosh(*this); }
- realvec exp() const { return MF::vml_exp(*this); }
- realvec exp10() const { return MF::vml_exp10(*this); }
- realvec exp2() const { return MF::vml_exp2(*this); }
- realvec expm1() const { return MF::vml_expm1(*this); }
- realvec fabs() const { return MF::vml_fabs(*this); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const { return _mm512_floor_pd(v); }
- realvec fma(realvec y, realvec z) const
- {
- return _mm512_fmadd_pd(v, x.v, y.v);
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, const mask_t &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return _mm512_castpd_si512(v); }
+ intvec_t convert_int() const {
+ intvec_t r(_mm512_undefined_epi32());
+ for (int n = 0; n < size; ++n) {
+ r.set_elt(n, floatprops::convert_int((*this)[n]));
}
- realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
- realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
+ return r;
+ }
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const { return RV(0.0) - *this; }
+
+ realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); }
+ realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); }
+ realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); }
+ realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); }
+
+ realvec &operator+=(const realvec &x) { return *this = *this + x; }
+ realvec &operator-=(const realvec &x) { return *this = *this - x; }
+ realvec &operator*=(const realvec &x) { return *this = *this * x; }
+ realvec &operator/=(const realvec &x) { return *this = *this / x; }
+
+ real_t maxval() const { returm _mm512_reduce_gmax_pd(v); }
+ real_t minval() const { returm _mm512_reduce_gmin_pd(v); }
+ real_t prod() const { returm _mm512_reduce_mul_pd(v); }
+ real_t sum() const { returm _mm512_reduce_add_pd(v); }
+
+ boolvec_t operator==(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ);
+ }
+ boolvec_t operator!=(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here
+ }
+ boolvec_t operator<(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ);
+ }
+ boolvec_t operator<=(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ);
+ }
+ boolvec_t operator>(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ);
+ }
+ boolvec_t operator>=(const realvec &x) const {
+ return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ);
+ }
+
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec acosh() const { return MF::vml_acosh(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec asinh() const { return MF::vml_asinh(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+ realvec atanh() const { return MF::vml_atanh(*this); }
+ realvec cbrt() const { return MF::vml_cbrt(*this); }
+ realvec ceil() const { return _mm512_ceil_pd(v); }
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec cos() const { return MF::vml_cos(*this); }
+ realvec cosh() const { return MF::vml_cosh(*this); }
+ realvec exp() const { return MF::vml_exp(*this); }
+ realvec exp10() const { return MF::vml_exp10(*this); }
+ realvec exp2() const { return MF::vml_exp2(*this); }
+ realvec expm1() const { return MF::vml_expm1(*this); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const { return _mm512_floor_pd(v); }
+ realvec fma(realvec y, realvec z) const {
+ return _mm512_fmadd_pd(v, x.v, y.v);
+ }
+ realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); }
+ realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); }
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
#ifdef VML_HAVE_NAN
- return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
+ return _mm512_cmp_pd(v, v, _CMP_UNORD_Q);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec log() const { return MF::vml_log(*this); }
- realvec log10() const { return MF::vml_log10(*this); }
- realvec log1p() const { return MF::vml_log1p(*this); }
- realvec log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return _mm512_fmadd_pd(v, x.v, y.v);
- }
- realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
- realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
- realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
- realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
- realvec rint() const
- {
- return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
- }
- realvec round() const { return MF::vml_round(*this); }
- realvec rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return as_int().signbit(); }
- realvec sin() const { return MF::vml_sin(*this); }
- realvec sinh() const { return MF::vml_sinh(*this); }
- realvec sqrt() const { return _mm512_sqrt_pd(v); }
- realvec tan() const { return MF::vml_tan(*this); }
- realvec tanh() const { return MF::vml_tanh(*this); }
- realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,4> boolvec<double,4>::as_int() const
- {
- return _mm512_castpd_si512(v);
- }
-
- inline intvec<double,4> boolvec<double,4>::convert_int() const
- {
- return ifthen(v, IV(I(1)), IV(I(0)));
- }
-
- inline
- boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return (v & x.v) | (~v & y.v);
- }
-
- inline
- intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return _mm512_blend_epi64(v, y.v, x.v)
- }
-
- inline
- realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
- {
- return _mm512_blend_pd(v, y.v, x.v)
- }
-
-
-
- // intvec definitions
-
- inline realvec<double,4> intvec<double,4>::as_float() const
- {
- return _mm512_castsi512_pd(v);
- }
-
- inline realvec<double,4> intvec<double,4>::convert_float() const
- {
- intvec_t r(_mm512_undefined_pd());
- for (int n=0; n<size; ++n) {
- r.set_elt(n, floatprops::convert_float((*this)[n]));
- }
- return r;
}
-
- inline intvec<double,8> intvec<double,8>::abs() const
- {
- return MF::vml_abs(*this);
- }
-
- inline intvec<double,8> intvec<double,8>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline boolvec<double,8> intvec<double,8>::isignbit() const
- {
- return MF::vml_isignbit(*this);
- }
-
- inline intvec<double,8> intvec<double,8>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
- }
-
- inline intvec<double,8> intvec<double,8>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
- }
-
- inline intvec<double,8> intvec<double,8>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<double,8> intvec<double,8>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return _mm512_fmadd_pd(v, x.v, y.v);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+ realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); }
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const { return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT); }
+ realvec round() const { return MF::vml_round(*this); }
+ realvec rsqrt() const { return MF::vml_rsqrt(*this); }
+ boolvec_t signbit() const { return as_int().signbit(); }
+ realvec sin() const { return MF::vml_sin(*this); }
+ realvec sinh() const { return MF::vml_sinh(*this); }
+ realvec sqrt() const { return _mm512_sqrt_pd(v); }
+ realvec tan() const { return MF::vml_tan(*this); }
+ realvec tanh() const { return MF::vml_tanh(*this); }
+ realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const {
+ return _mm512_castpd_si512(v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const {
+ return ifthen(v, IV(I(1)), IV(I(0)));
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return (v & x.v) | (~v & y.v);
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return _mm512_blend_epi64(v, y.v, x.v)
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return _mm512_blend_pd(v, y.v, x.v)
+}
+
+// intvec definitions
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const {
+ return _mm512_castsi512_pd(v);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const {
+ intvec_t r(_mm512_undefined_pd());
+ for (int n = 0; n < size; ++n) {
+ r.set_elt(n, floatprops::convert_float((*this)[n]));
+ }
+ return r;
+}
+
+inline intvec<double, 8> intvec<double, 8>::abs() const {
+ return MF::vml_abs(*this);
+}
+
+inline intvec<double, 8> intvec<double, 8>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline boolvec<double, 8> intvec<double, 8>::isignbit() const {
+ return MF::vml_isignbit(*this);
+}
+
+inline intvec<double, 8> intvec<double, 8>::max(intvec_t x) const {
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 8> intvec<double, 8>::min(intvec_t x) const {
+ return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 8> intvec<double, 8>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_MIC_DOUBLE8_H
+#endif // #ifndef VEC_MIC_DOUBLE8_H
diff --git a/vec_neon_float2.h b/vec_neon_float2.h
index 3a21a05..6df9969 100644
--- a/vec_neon_float2.h
+++ b/vec_neon_float2.h
@@ -14,608 +14,511 @@
// Neon intrinsics
#include <arm_neon.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_2
- template<> struct boolvec<float,2>;
- template<> struct intvec<float,2>;
- template<> struct realvec<float,2>;
-
-
-
- template<>
- struct boolvec<float,2>: floatprops<float>
- {
- static int const size = 2;
- typedef bool scalar_t;
- typedef uint32x2_t bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values are -1, false values are 0
- static uint_t from_bool(bool a) { return -int_t(a); }
- static bool to_bool(uint_t a) { return a; }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(vdup_n_u32(from_bool(a))) {}
- boolvec(bool const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return vmvn_u32(v); }
-
- boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
- boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
- boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
- boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
-
- bool all() const
- {
- boolvec r = vpmin_u32(v, v);
- return r[0];
- }
- bool any() const
- {
- boolvec r = vpmax_u32(v, v);
- return r[0];
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,2>: floatprops<float>
- {
- static int const size = 2;
- typedef int_t scalar_t;
- typedef int32x2_t ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(vdup_n_s32(a)) {}
- intvec(int_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
- static intvec iota()
- {
- return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- // Vector casts do not change the bit battern
- boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
- boolvec_t convert_bool() const { return *this != IV(0); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return vneg_s32(v); }
-
- intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
- intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
- intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
- intvec& operator*=(intvec const& x) { return *this=*this*x; }
-
-
-
- intvec operator~() const { return vmvn_s32(v); }
-
- intvec operator&(intvec x) const { return vand_s32(v, x.v); }
- intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
- intvec operator^(intvec x) const { return veor_s32(v, x.v); }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const
- {
- return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
- }
-
-
-
- intvec_t lsr(int_t n) const { return lsr(IV(n)); }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const { return *this >> IV(n); }
- intvec operator<<(int_t n) const { return *this << IV(n); }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec lsr(intvec n) const
- {
- return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- return vshl_s32(v, (-n).v);
- }
- intvec operator<<(intvec n) const
- {
- return vshl_s32(v, n.v);
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const { return vclz_s32(v); }
- intvec_t popcount() const
- {
- return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
- }
-
-
-
- boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); }
- boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
- boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); }
- boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); }
- boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); }
- boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); }
-
- intvec_t abs() const { return vabs_s32(v); }
- boolvec_t isignbit() const
- {
- //return *this < IV(I(0));
- return intvec(vshr_n_s32(v, FP::bits-1)).as_bool();
- }
- intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
- intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
- };
-
-
-
- template<>
- struct realvec<float,2>: floatprops<float>
- {
- static int const size = 2;
- typedef real_t scalar_t;
- typedef float32x2_t vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<NEON:2*float>"; }
- void barrier() { __asm__("": "+w"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(vdup_n_f32(a)) {}
- realvec(real_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return vld1_f32(p);
- }
- static realvec_t loadu(real_t const* p)
- {
+template <> struct boolvec<float, 2>;
+template <> struct intvec<float, 2>;
+template <> struct realvec<float, 2>;
+
+template <> struct boolvec<float, 2> : floatprops<float> {
+ static int const size = 2;
+ typedef bool scalar_t;
+ typedef uint32x2_t bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values are -1, false values are 0
+ static uint_t from_bool(bool a) { return -int_t(a); }
+ static bool to_bool(uint_t a) { return a; }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(vdup_n_u32(from_bool(a))) {}
+ boolvec(bool const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return vmvn_u32(v); }
+
+ boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); }
+ boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); }
+ boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); }
+ boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); }
+
+ bool all() const {
+ boolvec r = vpmin_u32(v, v);
+ return r[0];
+ }
+ bool any() const {
+ boolvec r = vpmax_u32(v, v);
+ return r[0];
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 2> : floatprops<float> {
+ static int const size = 2;
+ typedef int_t scalar_t;
+ typedef int32x2_t ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(vdup_n_s32(a)) {}
+ intvec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+ static intvec iota() {
+ return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0));
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ // Vector casts do not change the bit battern
+ boolvec_t as_bool() const { return vreinterpret_u32_s32(v); }
+ boolvec_t convert_bool() const { return *this != IV(0); }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return vneg_s32(v); }
+
+ intvec operator+(intvec x) const { return vadd_s32(v, x.v); }
+ intvec operator-(intvec x) const { return vsub_s32(v, x.v); }
+ intvec operator*(intvec x) const { return vmul_s32(v, x.v); }
+
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+ intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+ intvec operator~() const { return vmvn_s32(v); }
+
+ intvec operator&(intvec x) const { return vand_s32(v, x.v); }
+ intvec operator|(intvec x) const { return vorr_s32(v, x.v); }
+ intvec operator^(intvec x) const { return veor_s32(v, x.v); }
+
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const {
+ return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v);
+ }
+
+ intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+ intvec_t rotate(int_t n) const;
+ intvec operator>>(int_t n) const { return *this >> IV(n); }
+ intvec operator<<(int_t n) const { return *this << IV(n); }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec lsr(intvec n) const {
+ return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v));
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec operator>>(intvec n) const { return vshl_s32(v, (-n).v); }
+ intvec operator<<(intvec n) const { return vshl_s32(v, n.v); }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const { return vclz_s32(v); }
+ intvec_t popcount() const {
+ return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v))));
+ }
+
+ boolvec_t operator==(intvec const &x) const { return vceq_s32(v, x.v); }
+ boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(intvec const &x) const { return vclt_s32(v, x.v); }
+ boolvec_t operator<=(intvec const &x) const { return vcle_s32(v, x.v); }
+ boolvec_t operator>(intvec const &x) const { return vcgt_s32(v, x.v); }
+ boolvec_t operator>=(intvec const &x) const { return vcge_s32(v, x.v); }
+
+ intvec_t abs() const { return vabs_s32(v); }
+ boolvec_t isignbit() const {
+ // return *this < IV(I(0));
+ return intvec(vshr_n_s32(v, FP::bits - 1)).as_bool();
+ }
+ intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); }
+ intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 2> : floatprops<float> {
+ static int const size = 2;
+ typedef real_t scalar_t;
+ typedef float32x2_t vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<NEON:2*float>"; }
+ void barrier() { __asm__("" : "+w"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(vdup_n_f32(a)) {}
+ realvec(real_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return vld1_f32(p);
+ }
+ static realvec_t loadu(real_t const *p) {
#if defined __ARM_FEATURE_UNALIGNED
- return vld1_f32(p);
+ return vld1_f32(p);
#else
- realvec_t r;
- r.set_elt(0, p[0]);
- r.set_elt(1, p[1]);
- return r;
+ realvec_t r;
+ r.set_elt(0, p[0]);
+ r.set_elt(1, p[1]);
+ return r;
#endif
+ }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- vst1_f32(p, v);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storeu(real_t* p) const
- {
- // Vector stores would require vector loads, which would need to
- // be atomic
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ vst1_f32(p, v);
+ }
+ void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
#if defined __ARM_FEATURE_UNALIGNED
- vst1_f32(p, v);
+ vst1_f32(p, v);
#else
- p[0] = (*this)[0];
- p[1] = (*this)[1];
+ p[0] = (*this)[0];
+ p[1] = (*this)[1];
#endif
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
}
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return vreinterpret_s32_f32(v); }
- intvec_t convert_int() const { return vcvt_s32_f32(v); }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const { return vneg_f32(v); }
-
- realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
- realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
- realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
- realvec operator/(realvec x) const { return *this * x.rcp(); }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- realvec r = vpmax_f32(v, v);
- return r[0];
- }
- real_t minval() const
- {
- realvec r = vpmin_f32(v, v);
- return r[0];
- }
- real_t prod() const
- {
- return (*this)[0] * (*this)[1];
- }
- real_t sum() const
- {
- realvec r = vpadd_f32(v, v);
- return r[0];
- }
-
-
-
- boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); }
- boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
- boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); }
- boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); }
- boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); }
- boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); }
-
-
-
- realvec acos() const { return MF::vml_acos(*this); }
- realvec acosh() const { return MF::vml_acosh(*this); }
- realvec asin() const { return MF::vml_asin(*this); }
- realvec asinh() const { return MF::vml_asinh(*this); }
- realvec atan() const { return MF::vml_atan(*this); }
- realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
- realvec atanh() const { return MF::vml_atanh(*this); }
- realvec cbrt() const { return MF::vml_cbrt(*this); }
- realvec ceil() const
- {
- // return vrndp_f32(v);
- return MF::vml_ceil(*this);
- }
- realvec copysign(realvec y) const
- {
- return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
- }
- realvec cos() const { return MF::vml_cos(*this); }
- realvec cosh() const { return MF::vml_cosh(*this); }
- realvec exp() const { return MF::vml_exp(*this); }
- realvec exp10() const { return MF::vml_exp10(*this); }
- realvec exp2() const { return MF::vml_exp2(*this); }
- realvec expm1() const { return MF::vml_expm1(*this); }
- realvec fabs() const { return vabs_f32(v); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const
- {
- // return vrndm_f32(v);
- return MF::vml_floor(*this);
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return vfma_f32(z.v, v, y.v);
- }
- realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
- realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec log() const { return MF::vml_log(*this); }
- realvec log10() const { return MF::vml_log10(*this); }
- realvec log1p() const { return MF::vml_log1p(*this); }
- realvec log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- // TODO: vfma_f32
- return vmla_f32(z.v, v, y.v);
- }
- realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
- realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
- realvec rcp() const
- {
- realvec r = vrecpe_f32(v);
- r *= vrecps_f32(v, r);
- r *= vrecps_f32(v, r);
- return r;
- }
- realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
- realvec rint() const
- {
- // return vrndn_f32(v);
- return MF::vml_rint(*this);
- }
- realvec round() const
- {
- // return vrnda_f32(v);
- return MF::vml_round(*this);
- }
- realvec rsqrt() const
- {
- realvec r = vrsqrte_f32(v);
- r *= vrsqrts_f32(v, r*r);
- r *= vrsqrts_f32(v, r*r);
- return r;
- }
- boolvec_t signbit() const { return MF::vml_signbit(*this); }
- realvec sin() const { return MF::vml_sin(*this); }
- realvec sinh() const { return MF::vml_sinh(*this); }
- realvec sqrt() const { return *this * rsqrt(); }
- realvec tan() const { return MF::vml_tan(*this); }
- realvec tanh() const { return MF::vml_tanh(*this); }
- realvec trunc() const
- {
- // return vrnd_f32(v);
- return MF::vml_trunc(*this);
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
}
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,2> boolvec<float,2>::as_int() const
- {
- return vreinterpret_s32_u32(v);
- }
-
- inline intvec<float,2> boolvec<float,2>::convert_int() const
- {
- return - as_int();
- }
-
- inline
- boolvec<float,2> boolvec<float,2>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return vbsl_u32(v, x.v, y.v);
- }
-
- inline intvec<float,2> boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const
- {
- return vbsl_s32(v, x.v, y.v);
- }
-
- inline
- realvec<float,2> boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const
- {
- return vbsl_f32(v, x.v, y.v);
- }
-
-
-
- // intvec definitions
-
- inline realvec<float,2> intvec<float,2>::as_float() const
- {
- return vreinterpret_f32_s32(v);
- }
-
- inline realvec<float,2> intvec<float,2>::convert_float() const
- {
- return vcvt_f32_s32(v);
- }
-
- inline intvec<float,2> intvec<float,2>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,2> intvec<float,2>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return vreinterpret_s32_f32(v); }
+ intvec_t convert_int() const { return vcvt_s32_f32(v); }
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const { return vneg_f32(v); }
+
+ realvec operator+(realvec x) const { return vadd_f32(v, x.v); }
+ realvec operator-(realvec x) const { return vsub_f32(v, x.v); }
+ realvec operator*(realvec x) const { return vmul_f32(v, x.v); }
+ realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; }
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ real_t maxval() const {
+ realvec r = vpmax_f32(v, v);
+ return r[0];
+ }
+ real_t minval() const {
+ realvec r = vpmin_f32(v, v);
+ return r[0];
+ }
+ real_t prod() const { return (*this)[0] * (*this)[1]; }
+ real_t sum() const {
+ realvec r = vpadd_f32(v, v);
+ return r[0];
+ }
+
+ boolvec_t operator==(realvec const &x) const { return vceq_f32(v, x.v); }
+ boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(realvec const &x) const { return vclt_f32(v, x.v); }
+ boolvec_t operator<=(realvec const &x) const { return vcle_f32(v, x.v); }
+ boolvec_t operator>(realvec const &x) const { return vcgt_f32(v, x.v); }
+ boolvec_t operator>=(realvec const &x) const { return vcge_f32(v, x.v); }
+
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec acosh() const { return MF::vml_acosh(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec asinh() const { return MF::vml_asinh(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+ realvec atanh() const { return MF::vml_atanh(*this); }
+ realvec cbrt() const { return MF::vml_cbrt(*this); }
+ realvec ceil() const {
+ // return vrndp_f32(v);
+ return MF::vml_ceil(*this);
+ }
+ realvec copysign(realvec y) const {
+ return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v);
+ }
+ realvec cos() const { return MF::vml_cos(*this); }
+ realvec cosh() const { return MF::vml_cosh(*this); }
+ realvec exp() const { return MF::vml_exp(*this); }
+ realvec exp10() const { return MF::vml_exp10(*this); }
+ realvec exp2() const { return MF::vml_exp2(*this); }
+ realvec expm1() const { return MF::vml_expm1(*this); }
+ realvec fabs() const { return vabs_f32(v); }
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const {
+ // return vrndm_f32(v);
+ return MF::vml_floor(*this);
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return vfma_f32(z.v, v, y.v);
+ }
+ realvec fmax(realvec y) const { return vmax_f32(v, y.v); }
+ realvec fmin(realvec y) const { return vmin_f32(v, y.v); }
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ // TODO: vfma_f32
+ return vmla_f32(z.v, v, y.v);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+ realvec rcp() const {
+ realvec r = vrecpe_f32(v);
+ r *= vrecps_f32(v, r);
+ r *= vrecps_f32(v, r);
+ return r;
+ }
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const {
+ // return vrndn_f32(v);
+ return MF::vml_rint(*this);
+ }
+ realvec round() const {
+ // return vrnda_f32(v);
+ return MF::vml_round(*this);
+ }
+ realvec rsqrt() const {
+ realvec r = vrsqrte_f32(v);
+ r *= vrsqrts_f32(v, r * r);
+ r *= vrsqrts_f32(v, r * r);
+ return r;
+ }
+ boolvec_t signbit() const { return MF::vml_signbit(*this); }
+ realvec sin() const { return MF::vml_sin(*this); }
+ realvec sinh() const { return MF::vml_sinh(*this); }
+ realvec sqrt() const { return *this * rsqrt(); }
+ realvec tan() const { return MF::vml_tan(*this); }
+ realvec tanh() const { return MF::vml_tanh(*this); }
+ realvec trunc() const {
+ // return vrnd_f32(v);
+ return MF::vml_trunc(*this);
+ }
+};
+
+// boolvec definitions
+
+inline intvec<float, 2> boolvec<float, 2>::as_int() const {
+ return vreinterpret_s32_u32(v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::convert_int() const {
+ return -as_int();
+}
+
+inline boolvec<float, 2> boolvec<float, 2>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return vbsl_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 2> boolvec<float, 2>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return vbsl_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 2> boolvec<float, 2>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return vbsl_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 2> intvec<float, 2>::as_float() const {
+ return vreinterpret_f32_s32(v);
+}
+
+inline realvec<float, 2> intvec<float, 2>::convert_float() const {
+ return vcvt_f32_s32(v);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 2> intvec<float, 2>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_NEON_FLOAT2_H
+#endif // #ifndef VEC_NEON_FLOAT2_H
diff --git a/vec_neon_float4.h b/vec_neon_float4.h
index 2bd9dda..9ec1e79 100644
--- a/vec_neon_float4.h
+++ b/vec_neon_float4.h
@@ -14,628 +14,537 @@
// Neon intrinsics
#include <arm_neon.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_4
- template<> struct boolvec<float,4>;
- template<> struct intvec<float,4>;
- template<> struct realvec<float,4>;
-
-
-
- template<>
- struct boolvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef bool scalar_t;
- typedef uint32x4_t bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values are -1, false values are 0
- static uint_t from_bool(bool a) { return -int_t(a); }
- static bool to_bool(uint_t a) { return a; }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(vdupq_n_u32(from_bool(a))) {}
- boolvec(bool const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return vmvnq_u32(v); }
-
- boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
- boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
- boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
- boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
-
- bool all() const
- {
- uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
- uint32x2_t y = vpmin_u32(x, x);
- uint32_t z = vget_lane_u32(y, 0);
- return to_bool(z);
- }
- bool any() const
- {
- uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
- uint32x2_t y = vpmax_u32(x, x);
- uint32_t z = vget_lane_u32(y, 0);
- return to_bool(z);
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef int_t scalar_t;
- typedef int32x4_t ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(vdupq_n_s32(a)) {}
- intvec(int_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
- static intvec iota()
- {
- return
- vcombine_s32(vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
- vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- // Vector casts do not change the bit battern
- boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
- boolvec_t convert_bool() const { return *this != IV(0); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return vnegq_s32(v); }
-
- intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
- intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
- intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
- intvec& operator*=(intvec const& x) { return *this=*this*x; }
-
-
-
- intvec operator~() const { return vmvnq_s32(v); }
-
- intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
- intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
- intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const
- {
- return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
- }
-
-
-
- intvec_t lsr(int_t n) const { return lsr(IV(n)); }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const { return *this >> IV(n); }
- intvec operator<<(int_t n) const { return *this << IV(n); }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- return vshlq_s32(v, (-n).v);
- }
- intvec operator<<(intvec n) const
- {
- return vshlq_s32(v, n.v);
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const { return vclzq_s32(v); }
- intvec_t popcount() const
- {
- return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
- }
-
-
-
- boolvec_t operator==(intvec const& x) const { return vceqq_s32(v, x.v); }
- boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
- boolvec_t operator<(intvec const& x) const { return vcltq_s32(v, x.v); }
- boolvec_t operator<=(intvec const& x) const { return vcleq_s32(v, x.v); }
- boolvec_t operator>(intvec const& x) const { return vcgtq_s32(v, x.v); }
- boolvec_t operator>=(intvec const& x) const { return vcgeq_s32(v, x.v); }
-
- intvec_t abs() const { return vabsq_s32(v); }
- boolvec_t isignbit() const
- {
- //return *this < IV(I(0));
- return intvec(vshrq_n_s32(v, FP::bits-1)).as_bool();
- }
- intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
- intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
- };
-
-
-
- template<>
- struct realvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef real_t scalar_t;
- typedef float32x4_t vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<NEON:4*float>"; }
- void barrier() { __asm__("": "+w"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(vdupq_n_f32(a)) {}
- realvec(real_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return vld1q_f32(p);
- }
- static realvec_t loadu(real_t const* p)
- {
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef bool scalar_t;
+ typedef uint32x4_t bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values are -1, false values are 0
+ static uint_t from_bool(bool a) { return -int_t(a); }
+ static bool to_bool(uint_t a) { return a; }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(vdupq_n_u32(from_bool(a))) {}
+ boolvec(bool const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return vmvnq_u32(v); }
+
+ boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); }
+ boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); }
+ boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); }
+ boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); }
+
+ bool all() const {
+ uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v));
+ uint32x2_t y = vpmin_u32(x, x);
+ uint32_t z = vget_lane_u32(y, 0);
+ return to_bool(z);
+ }
+ bool any() const {
+ uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v));
+ uint32x2_t y = vpmax_u32(x, x);
+ uint32_t z = vget_lane_u32(y, 0);
+ return to_bool(z);
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef int_t scalar_t;
+ typedef int32x4_t ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(vdupq_n_s32(a)) {}
+ intvec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+ static intvec iota() {
+ return vcombine_s32(
+ vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)),
+ vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2)));
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ // Vector casts do not change the bit pattern
+ boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); }
+ boolvec_t convert_bool() const { return *this != IV(0); }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return vnegq_s32(v); }
+
+ intvec operator+(intvec x) const { return vaddq_s32(v, x.v); }
+ intvec operator-(intvec x) const { return vsubq_s32(v, x.v); }
+ intvec operator*(intvec x) const { return vmulq_s32(v, x.v); }
+
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+ intvec &operator*=(intvec const &x) { return *this = *this * x; }
+
+ intvec operator~() const { return vmvnq_s32(v); }
+
+ intvec operator&(intvec x) const { return vandq_s32(v, x.v); }
+ intvec operator|(intvec x) const { return vorrq_s32(v, x.v); }
+ intvec operator^(intvec x) const { return veorq_s32(v, x.v); }
+
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const {
+ return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v);
+ }
+
+ intvec_t lsr(int_t n) const { return lsr(IV(n)); }
+ intvec_t rotate(int_t n) const;
+ intvec operator>>(int_t n) const { return *this >> IV(n); }
+ intvec operator<<(int_t n) const { return *this << IV(n); }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const {
+ return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v));
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec operator>>(intvec n) const { return vshlq_s32(v, (-n).v); }
+ intvec operator<<(intvec n) const { return vshlq_s32(v, n.v); }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const { return vclzq_s32(v); }
+ intvec_t popcount() const {
+ return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v))));
+ }
+
+ boolvec_t operator==(intvec const &x) const { return vceqq_s32(v, x.v); }
+ boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(intvec const &x) const { return vcltq_s32(v, x.v); }
+ boolvec_t operator<=(intvec const &x) const { return vcleq_s32(v, x.v); }
+ boolvec_t operator>(intvec const &x) const { return vcgtq_s32(v, x.v); }
+ boolvec_t operator>=(intvec const &x) const { return vcgeq_s32(v, x.v); }
+
+ intvec_t abs() const { return vabsq_s32(v); }
+ boolvec_t isignbit() const {
+ // return *this < IV(I(0));
+ return intvec(vshrq_n_s32(v, FP::bits - 1)).as_bool();
+ }
+ intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); }
+ intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); }
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef real_t scalar_t;
+ typedef float32x4_t vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<NEON:4*float>"; }
+ void barrier() { __asm__("" : "+w"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(vdupq_n_f32(a)) {}
+ realvec(real_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return vld1q_f32(p);
+ }
+ static realvec_t loadu(real_t const *p) {
#if defined __ARM_FEATURE_UNALIGNED
- return vld1q_f32(p);
+ return vld1q_f32(p);
#else
- realvec_t r;
- r.set_elt(0, p[0]);
- r.set_elt(1, p[1]);
- r.set_elt(2, p[2]);
- r.set_elt(3, p[3]);
- return r;
+ realvec_t r;
+ r.set_elt(0, p[0]);
+ r.set_elt(1, p[1]);
+ r.set_elt(2, p[2]);
+ r.set_elt(3, p[3]);
+ return r;
#endif
+ }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- vst1q_f32(p, v);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storeu(real_t* p) const
- {
- // Vector stores would require vector loads, which would need to
- // be atomic
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ vst1q_f32(p, v);
+ }
+ void storeu(real_t *p) const {
+// Vector stores would require vector loads, which would need to
+// be atomic
#if defined __ARM_FEATURE_UNALIGNED
- vst1q_f32(p, v);
+ vst1q_f32(p, v);
#else
- p[0] = (*this)[0];
- p[1] = (*this)[1];
- p[2] = (*this)[2];
- p[3] = (*this)[3];
+ p[0] = (*this)[0];
+ p[1] = (*this)[1];
+ p[2] = (*this)[2];
+ p[3] = (*this)[3];
#endif
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
- intvec_t convert_int() const { return vcvtq_s32_f32(v); }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const { return vnegq_f32(v); }
-
- realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
- realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
- realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
- realvec operator/(realvec x) const { return *this * x.rcp(); }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
- float32x2_t y = vpmax_f32(x, x);
- float32_t z = vget_lane_f32(y, 0);
- return z;
- }
- real_t minval() const
- {
- float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
- float32x2_t y = vpmin_f32(x, x);
- float32_t z = vget_lane_f32(y, 0);
- return z;
- }
- real_t prod() const
- {
- // TODO: multiply pairwise with 2-vectors
- return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
- }
- real_t sum() const
- {
- float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
- float32x2_t y = vpadd_f32(x, x);
- float32_t z = vget_lane_f32(y, 0);
- return z;
- }
-
-
-
- boolvec_t operator==(realvec const& x) const { return vceqq_f32(v, x.v); }
- boolvec_t operator!=(realvec const& x) const { return !(*this == x); }
- boolvec_t operator<(realvec const& x) const { return vcltq_f32(v, x.v); }
- boolvec_t operator<=(realvec const& x) const { return vcleq_f32(v, x.v); }
- boolvec_t operator>(realvec const& x) const { return vcgtq_f32(v, x.v); }
- boolvec_t operator>=(realvec const& x) const { return vcgeq_f32(v, x.v); }
-
-
-
- realvec acos() const { return MF::vml_acos(*this); }
- realvec acosh() const { return MF::vml_acosh(*this); }
- realvec asin() const { return MF::vml_asin(*this); }
- realvec asinh() const { return MF::vml_asinh(*this); }
- realvec atan() const { return MF::vml_atan(*this); }
- realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
- realvec atanh() const { return MF::vml_atanh(*this); }
- realvec cbrt() const { return MF::vml_cbrt(*this); }
- realvec ceil() const
- {
- // return vrndpq_f32(v);
- return MF::vml_ceil(*this);
- }
- realvec copysign(realvec y) const
- {
- return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
- }
- realvec cos() const { return MF::vml_cos(*this); }
- realvec cosh() const { return MF::vml_cosh(*this); }
- realvec exp() const { return MF::vml_exp(*this); }
- realvec exp10() const { return MF::vml_exp10(*this); }
- realvec exp2() const { return MF::vml_exp2(*this); }
- realvec expm1() const { return MF::vml_expm1(*this); }
- realvec fabs() const { return vabsq_f32(v); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const
- {
- // return vrndmq_f32(v);
- return MF::vml_floor(*this);
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return vfmaq_f32(z.v, v, y.v);
- }
- realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
- realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec log() const { return MF::vml_log(*this); }
- realvec log10() const { return MF::vml_log10(*this); }
- realvec log1p() const { return MF::vml_log1p(*this); }
- realvec log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return vmlaq_f32(z.v, v, y.v);
- }
- realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
- realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
- realvec rcp() const
- {
- realvec r = vrecpeq_f32(v);
- r *= vrecpsq_f32(v, r);
- r *= vrecpsq_f32(v, r);
- return r;
- }
- realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
- realvec rint() const
- {
- // return vrndnq_f32(v);
- return MF::vml_rint(*this);
- }
- realvec round() const
- {
- // return vrndaq_f32(v);
- return MF::vml_round(*this);
- }
- realvec rsqrt() const
- {
- realvec r = vrsqrteq_f32(v);
- r *= vrsqrtsq_f32(v, r*r);
- r *= vrsqrtsq_f32(v, r*r);
- return r;
- }
- boolvec_t signbit() const { return MF::vml_signbit(*this); }
- realvec sin() const { return MF::vml_sin(*this); }
- realvec sinh() const { return MF::vml_sinh(*this); }
- realvec sqrt() const { return *this * rsqrt(); }
- realvec tan() const { return MF::vml_tan(*this); }
- realvec tanh() const { return MF::vml_tanh(*this); }
- realvec trunc() const
- {
- // return vrndq_f32(v);
- return MF::vml_trunc(*this);
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,4> boolvec<float,4>::as_int() const
- {
- return vreinterpretq_s32_u32(v);
- }
-
- inline intvec<float,4> boolvec<float,4>::convert_int() const
- {
- return - as_int();
- }
-
- inline
- boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return vbslq_u32(v, x.v, y.v);
- }
-
- inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return vbslq_s32(v, x.v, y.v);
- }
-
- inline
- realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
- {
- return vbslq_f32(v, x.v, y.v);
- }
-
-
-
- // intvec definitions
-
- inline realvec<float,4> intvec<float,4>::as_float() const
- {
- return vreinterpretq_f32_s32(v);
- }
-
- inline realvec<float,4> intvec<float,4>::convert_float() const
- {
- return vcvtq_f32_s32(v);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return vreinterpretq_s32_f32(v); }
+ intvec_t convert_int() const { return vcvtq_s32_f32(v); }
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const { return vnegq_f32(v); }
+
+ realvec operator+(realvec x) const { return vaddq_f32(v, x.v); }
+ realvec operator-(realvec x) const { return vsubq_f32(v, x.v); }
+ realvec operator*(realvec x) const { return vmulq_f32(v, x.v); }
+ realvec operator/(realvec x) const { return *this * x.rcp(); }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; }
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ real_t maxval() const {
+ float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
+ float32x2_t y = vpmax_f32(x, x);
+ float32_t z = vget_lane_f32(y, 0);
+ return z;
+ }
+ real_t minval() const {
+ float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
+ float32x2_t y = vpmin_f32(x, x);
+ float32_t z = vget_lane_f32(y, 0);
+ return z;
+ }
+ real_t prod() const {
+ // TODO: multiply pairwise with 2-vectors
+ return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+ }
+ real_t sum() const {
+ float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v));
+ float32x2_t y = vpadd_f32(x, x);
+ float32_t z = vget_lane_f32(y, 0);
+ return z;
+ }
+
+ boolvec_t operator==(realvec const &x) const { return vceqq_f32(v, x.v); }
+ boolvec_t operator!=(realvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(realvec const &x) const { return vcltq_f32(v, x.v); }
+ boolvec_t operator<=(realvec const &x) const { return vcleq_f32(v, x.v); }
+ boolvec_t operator>(realvec const &x) const { return vcgtq_f32(v, x.v); }
+ boolvec_t operator>=(realvec const &x) const { return vcgeq_f32(v, x.v); }
+
+ realvec acos() const { return MF::vml_acos(*this); }
+ realvec acosh() const { return MF::vml_acosh(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec asinh() const { return MF::vml_asinh(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+ realvec atanh() const { return MF::vml_atanh(*this); }
+ realvec cbrt() const { return MF::vml_cbrt(*this); }
+ realvec ceil() const {
+ // return vrndpq_f32(v);
+ return MF::vml_ceil(*this);
+ }
+ realvec copysign(realvec y) const {
+ return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v);
+ }
+ realvec cos() const { return MF::vml_cos(*this); }
+ realvec cosh() const { return MF::vml_cosh(*this); }
+ realvec exp() const { return MF::vml_exp(*this); }
+ realvec exp10() const { return MF::vml_exp10(*this); }
+ realvec exp2() const { return MF::vml_exp2(*this); }
+ realvec expm1() const { return MF::vml_expm1(*this); }
+ realvec fabs() const { return vabsq_f32(v); }
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const {
+ // return vrndmq_f32(v);
+ return MF::vml_floor(*this);
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return vfmaq_f32(z.v, v, y.v);
+ }
+ realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); }
+ realvec fmin(realvec y) const { return vminq_f32(v, y.v); }
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return vmlaq_f32(z.v, v, y.v);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+ realvec rcp() const {
+ realvec r = vrecpeq_f32(v);
+ r *= vrecpsq_f32(v, r);
+ r *= vrecpsq_f32(v, r);
+ return r;
+ }
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const {
+ // return vrndnq_f32(v);
+ return MF::vml_rint(*this);
+ }
+ realvec round() const {
+ // return vrndaq_f32(v);
+ return MF::vml_round(*this);
+ }
+ realvec rsqrt() const {
+ realvec r = vrsqrteq_f32(v);
+ r *= vrsqrtsq_f32(v, r * r);
+ r *= vrsqrtsq_f32(v, r * r);
+ return r;
+ }
+ boolvec_t signbit() const { return MF::vml_signbit(*this); }
+ realvec sin() const { return MF::vml_sin(*this); }
+ realvec sinh() const { return MF::vml_sinh(*this); }
+ realvec sqrt() const { return *this * rsqrt(); }
+ realvec tan() const { return MF::vml_tan(*this); }
+ realvec tanh() const { return MF::vml_tanh(*this); }
+ realvec trunc() const {
+ // return vrndq_f32(v);
+ return MF::vml_trunc(*this);
+ }
+};
+
+// boolvec definitions
+
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+ return vreinterpretq_s32_u32(v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+ return -as_int();
+}
+
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return vbslq_u32(v, x.v, y.v);
+}
+
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return vbslq_s32(v, x.v, y.v);
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return vbslq_f32(v, x.v, y.v);
+}
+
+// intvec definitions
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+ return vreinterpretq_f32_s32(v);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
+ return vcvtq_f32_s32(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_NEON_FLOAT4_H
+#endif // #ifndef VEC_NEON_FLOAT4_H
diff --git a/vec_pseudo.h b/vec_pseudo.h
index 2aafc23..c4cbbc1 100644
--- a/vec_pseudo.h
+++ b/vec_pseudo.h
@@ -12,1668 +12,1492 @@
#include <climits>
#include <cstdlib>
#ifndef VML_NO_IOSTREAM
-# include <sstream>
+#include <sstream>
#endif
#include <string>
+namespace vecmathlib {
+template <typename T, int N> struct boolpseudovec;
+template <typename T, int N> struct intpseudovec;
+template <typename T, int N> struct realpseudovec;
-namespace vecmathlib {
-
- template<typename T, int N> struct boolpseudovec;
- template<typename T, int N> struct intpseudovec;
- template<typename T, int N> struct realpseudovec;
-
-
-
- template<typename T, int N>
- struct boolpseudovec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef bool scalar_t;
- typedef bool bvector_t[size];
- static int const alignment = sizeof(bool);
-
- typedef boolpseudovec boolvec_t;
- typedef intpseudovec<real_t, size> intvec_t;
- typedef realpseudovec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolpseudovec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolpseudovec(boolpseudovec const& x): v(x.v) {}
- // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
- boolpseudovec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
- boolpseudovec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-
- bool operator[](int n) const { return v[n]; }
- boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-
-
-
- intvec_t as_int() const; // defined after intpseudovec
- intvec_t convert_int() const; // defined after intpseudovec
-
-
-
- boolvec_t operator!() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = !v[d];
- return res;
- }
-
- boolvec_t operator&&(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
- return res;
- }
- boolvec_t operator||(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
- return res;
- }
- boolvec_t operator==(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
-
- bool all() const
- {
- bool res = v[0];
- for (int d=1; d<size; ++d) res = res && v[d];
- return res;
- }
- bool any() const
- {
- bool res = v[0];
- for (int d=1; d<size; ++d) res = res || v[d];
- return res;
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realpseudovec
- };
-
-
-
- template<typename T, int N>
- struct intpseudovec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef int_t scalar_t;
- typedef int_t ivector_t[size];
- static int const alignment = sizeof(int_t);
-
- typedef boolpseudovec<real_t, size> boolvec_t;
- typedef intpseudovec intvec_t;
- typedef realpseudovec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intpseudovec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intpseudovec(intpseudovec const& x): v(x.v) {}
- // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
- intpseudovec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
- intpseudovec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
- static intvec_t iota()
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d]=d;
- return res;
- }
-
- int_t operator[](int n) const { return v[n]; }
- intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-
-
-
- boolvec_t as_bool() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d]=v[d];
- return res;
- }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d];
- return res;
- }
- realvec_t as_float() const; // defined after realpseudovec
- realvec_t convert_float() const; // defined after realpseudovec
-
-
-
- intvec_t operator+() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = + v[d];
- return res;
- }
- intvec_t operator-() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = - v[d];
- return res;
- }
-
- intvec_t& operator+=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] += x.v[d];
- return *this;
- }
- intvec_t& operator-=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] -= x.v[d];
- return *this;
- }
- intvec_t& operator*=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] *= x.v[d];
- return *this;
- }
- intvec_t& operator/=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] /= x.v[d];
- return *this;
- }
- intvec_t& operator%=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] %= x.v[d];
- return *this;
- }
-
- intvec_t operator+(intvec_t x) const
- {
- intvec_t res = *this;
- return res += x;
- }
- intvec_t operator-(intvec_t x) const
- {
- intvec_t res = *this;
- return res -= x;
- }
- intvec_t operator*(intvec_t x) const
- {
- intvec_t res = *this;
- return res *= x;
- }
- intvec_t operator/(intvec_t x) const
- {
- intvec_t res = *this;
- return res /= x;
- }
- intvec_t operator%(intvec_t x) const
- {
- intvec_t res = *this;
- return res %= x;
- }
-
-
-
- intvec_t operator~() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
- return res;
- }
-
- intvec_t& operator&=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] &= x.v[d];
- return *this;
- }
- intvec_t& operator|=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] |= x.v[d];
- return *this;
- }
- intvec_t& operator^=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] ^= x.v[d];
- return *this;
- }
-
- intvec_t operator&(intvec_t x) const
- {
- intvec_t res = *this;
- return res &= x;
- }
- intvec_t operator|(intvec_t x) const
- {
- intvec_t res = *this;
- return res |= x;
- }
- intvec_t operator^(intvec_t x) const
- {
- intvec_t res = *this;
- return res ^= x;
- }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
- return res;
- }
- intvec_t rotate(int_t n) const;
- intvec_t& operator>>=(int_t n)
- {
- for (int d=0; d<size; ++d) v[d] >>= n;
- return *this;
- }
- intvec_t& operator<<=(int_t n)
- {
- for (int d=0; d<size; ++d) v[d] <<= n;
- return *this;
- }
- intvec_t operator>>(int_t n) const
- {
- intvec_t res = *this;
- return res >>= n;
- }
- intvec_t operator<<(int_t n) const
- {
- intvec_t res = *this;
- return res <<= n;
- }
-
- intvec_t lsr(intvec_t n) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
- return res;
- }
- intvec_t rotate(intvec_t n) const;
- intvec_t& operator>>=(intvec_t n)
- {
- for (int d=0; d<size; ++d) v[d] >>= n.v[d];
- return *this;
- }
- intvec_t& operator<<=(intvec_t n)
- {
- for (int d=0; d<size; ++d) v[d] <<= n.v[d];
- return *this;
- }
- intvec_t operator>>(intvec_t n) const
- {
- intvec_t res = *this;
- return res >>= n;
- }
- intvec_t operator<<(intvec_t n) const
- {
- intvec_t res = *this;
- return res <<= n;
- }
-
- intvec_t clz() const
- {
- intvec_t res;
+template <typename T, int N> struct boolpseudovec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static int const size = N;
+ typedef bool scalar_t;
+ typedef bool bvector_t[size];
+ static int const alignment = sizeof(bool);
+
+ typedef boolpseudovec boolvec_t;
+ typedef intpseudovec<real_t, size> intvec_t;
+ typedef realpseudovec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolpseudovec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolpseudovec(boolpseudovec const& x): v(x.v) {}
+ // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; }
+ boolpseudovec(bool a) {
+ for (int d = 0; d < size; ++d)
+ v[d] = a;
+ }
+ boolpseudovec(bool const *as) {
+ for (int d = 0; d < size; ++d)
+ v[d] = as[d];
+ }
+
+ bool operator[](int n) const { return v[n]; }
+ boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; }
+
+ intvec_t as_int() const; // defined after intpseudovec
+ intvec_t convert_int() const; // defined after intpseudovec
+
+ boolvec_t operator!() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = !v[d];
+ return res;
+ }
+
+ boolvec_t operator&&(boolvec_t x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] && x.v[d];
+ return res;
+ }
+ boolvec_t operator||(boolvec_t x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] || x.v[d];
+ return res;
+ }
+ boolvec_t operator==(boolvec_t x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] == x.v[d];
+ return res;
+ }
+ boolvec_t operator!=(boolvec_t x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] != x.v[d];
+ return res;
+ }
+
+ bool all() const {
+ bool res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = res && v[d];
+ return res;
+ }
+ bool any() const {
+ bool res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = res || v[d];
+ return res;
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec
+ realvec_t ifthen(realvec_t x,
+ realvec_t y) const; // defined after realpseudovec
+};
+
+template <typename T, int N> struct intpseudovec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static int const size = N;
+ typedef int_t scalar_t;
+ typedef int_t ivector_t[size];
+ static int const alignment = sizeof(int_t);
+
+ typedef boolpseudovec<real_t, size> boolvec_t;
+ typedef intpseudovec intvec_t;
+ typedef realpseudovec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intpseudovec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intpseudovec(intpseudovec const& x): v(x.v) {}
+ // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; }
+ intpseudovec(int_t a) {
+ for (int d = 0; d < size; ++d)
+ v[d] = a;
+ }
+ intpseudovec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ v[d] = as[d];
+ }
+ static intvec_t iota() {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = d;
+ return res;
+ }
+
+ int_t operator[](int n) const { return v[n]; }
+ intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+ boolvec_t as_bool() const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d];
+ return res;
+ }
+ boolvec_t convert_bool() const {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d];
+ return res;
+ }
+ realvec_t as_float() const; // defined after realpseudovec
+ realvec_t convert_float() const; // defined after realpseudovec
+
+ intvec_t operator+() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = +v[d];
+ return res;
+ }
+ intvec_t operator-() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = -v[d];
+ return res;
+ }
+
+ intvec_t &operator+=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] += x.v[d];
+ return *this;
+ }
+ intvec_t &operator-=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] -= x.v[d];
+ return *this;
+ }
+ intvec_t &operator*=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] *= x.v[d];
+ return *this;
+ }
+ intvec_t &operator/=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] /= x.v[d];
+ return *this;
+ }
+ intvec_t &operator%=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] %= x.v[d];
+ return *this;
+ }
+
+ intvec_t operator+(intvec_t x) const {
+ intvec_t res = *this;
+ return res += x;
+ }
+ intvec_t operator-(intvec_t x) const {
+ intvec_t res = *this;
+ return res -= x;
+ }
+ intvec_t operator*(intvec_t x) const {
+ intvec_t res = *this;
+ return res *= x;
+ }
+ intvec_t operator/(intvec_t x) const {
+ intvec_t res = *this;
+ return res /= x;
+ }
+ intvec_t operator%(intvec_t x) const {
+ intvec_t res = *this;
+ return res %= x;
+ }
+
+ intvec_t operator~() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = ~v[d];
+ return res;
+ }
+
+ intvec_t &operator&=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] &= x.v[d];
+ return *this;
+ }
+ intvec_t &operator|=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] |= x.v[d];
+ return *this;
+ }
+ intvec_t &operator^=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] ^= x.v[d];
+ return *this;
+ }
+
+ intvec_t operator&(intvec_t x) const {
+ intvec_t res = *this;
+ return res &= x;
+ }
+ intvec_t operator|(intvec_t x) const {
+ intvec_t res = *this;
+ return res |= x;
+ }
+ intvec_t operator^(intvec_t x) const {
+ intvec_t res = *this;
+ return res ^= x;
+ }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = I(U(v[d]) >> U(n));
+ return res;
+ }
+ intvec_t rotate(int_t n) const;
+ intvec_t &operator>>=(int_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] >>= n;
+ return *this;
+ }
+ intvec_t &operator<<=(int_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] <<= n;
+ return *this;
+ }
+ intvec_t operator>>(int_t n) const {
+ intvec_t res = *this;
+ return res >>= n;
+ }
+ intvec_t operator<<(int_t n) const {
+ intvec_t res = *this;
+ return res <<= n;
+ }
+
+ intvec_t lsr(intvec_t n) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = I(U(v[d]) >> U(n.v[d]));
+ return res;
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t &operator>>=(intvec_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] >>= n.v[d];
+ return *this;
+ }
+ intvec_t &operator<<=(intvec_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] <<= n.v[d];
+ return *this;
+ }
+ intvec_t operator>>(intvec_t n) const {
+ intvec_t res = *this;
+ return res >>= n;
+ }
+ intvec_t operator<<(intvec_t n) const {
+ intvec_t res = *this;
+ return res <<= n;
+ }
+
+ intvec_t clz() const { // Per-element count of leading zero bits; uses compiler builtins when the guard below matches, else MF::vml_clz.
+ intvec_t res; // NOTE(review): the guard below tests __gcc__; GCC predefines __GNUC__, not __gcc__, so GCC builds appear to take the MF::vml_clz fallback - confirm this is intended
#if defined __clang__ || defined __gcc__
- for (int d=0; d<size; ++d) {
- if (v[d] == 0) {
- res.v[d] = CHAR_BIT * sizeof v[d];
+ for (int d = 0; d < size; ++d) {
+ if (v[d] == 0) { // zero is answered directly instead of being passed to a __builtin_clz* call
+ res.v[d] = CHAR_BIT * sizeof v[d]; // i.e. the full bit width of the element
+ } else {
+ if (sizeof v[d] == sizeof(long long)) { // pick the builtin whose operand width equals the element width
+ res.v[d] = __builtin_clzll(v[d]);
+ } else if (sizeof v[d] == sizeof(long)) {
+ res.v[d] = __builtin_clzl(v[d]);
+ } else if (sizeof v[d] == sizeof(int)) {
+ res.v[d] = __builtin_clz(v[d]);
+ } else if (sizeof v[d] == sizeof(short)) {
+ res.v[d] = __builtin_clzs(v[d]);
+ } else if (sizeof v[d] == sizeof(char)) {
+ res.v[d] = __builtin_clzs((unsigned short)(unsigned char)v[d]) - // char-sized: widen without sign extension, count in a short ...
+ CHAR_BIT * (sizeof(short) - sizeof(char)); // ... then subtract the bits that the widening added on top
} else {
- if (sizeof v[d] == sizeof(long long)) {
- res.v[d] = __builtin_clzll(v[d]);
- } else if (sizeof v[d] == sizeof(long)) {
- res.v[d] = __builtin_clzl(v[d]);
- } else if (sizeof v[d] == sizeof(int)) {
- res.v[d] = __builtin_clz(v[d]);
- } else if (sizeof v[d] == sizeof(short)) {
- res.v[d] = __builtin_clzs(v[d]);
- } else if (sizeof v[d] == sizeof(char)) {
- res.v[d] =
- __builtin_clzs((unsigned short)(unsigned char)v[d]) -
- CHAR_BIT * (sizeof(short) - sizeof(char));
- } else {
- __builtin_unreachable();
- }
+ __builtin_unreachable(); // no other element width is handled
}
}
+ }
#else
- res = MF::vml_clz(*this);
+ res = MF::vml_clz(*this); // fallback: delegate the whole vector to the mathfuncs implementation
#endif
- return res;
- }
- intvec_t popcount() const
- {
- intvec_t res;
+ return res;
+ }
+ intvec_t popcount() const { // Per-element count of set bits; uses compiler builtins when the guard below matches, else MF::vml_popcount.
+ intvec_t res;
#if defined __clang__ || defined __gcc__
- if (sizeof(int_t) == sizeof(long long)) {
- for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountll(v[d]);
- } else if (sizeof(int_t) == sizeof(long)) {
- for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountl(v[d]);
- } else if (sizeof(int_t) <= sizeof(int)) {
- for (int d=0; d<size; ++d) res.v[d] = __builtin_popcount(v[d]);
- } else {
- __builtin_unreachable();
- }
+ if (sizeof(int_t) == sizeof(long long)) { // choose the builtin by element width; the choice is the same for every element
+ for (int d = 0; d < size; ++d)
+ res.v[d] = __builtin_popcountll(v[d]);
+ } else if (sizeof(int_t) == sizeof(long)) {
+ for (int d = 0; d < size; ++d)
+ res.v[d] = __builtin_popcountl(v[d]);
+ } else if (sizeof(int_t) <= sizeof(int)) {
+ for (int d = 0; d < size; ++d)
+ res.v[d] = __builtin_popcount(U(v[d])); // cast to U first: a negative element narrower than int would otherwise be sign-extended and its extra high one-bits counted
+ } else {
+ __builtin_unreachable(); // element wider than long long is not handled
+ }
#else
- res = MF::vml_popcount(*this);
+ res = MF::vml_popcount(*this); // fallback: delegate the whole vector to the mathfuncs implementation
#endif
- return res;
- }
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
- boolvec_t operator<(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
- return res;
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
- return res;
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
- return res;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
- return res;
- }
-
- intvec_t abs() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = std::abs(v[d]);
- return res;
- }
-
- boolvec_t isignbit() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] < 0;
- return res;
- }
-
- intvec_t max(intvec_t x) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = std::max(v[d], x.v[d]);
- return res;
- }
-
- intvec_t min(intvec_t x) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = std::min(v[d], x.v[d]);
- return res;
- }
- };
-
-
-
- template<typename T, int N>
- struct realpseudovec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef real_t scalar_t;
- typedef real_t vector_t[size];
- static int const alignment = sizeof(real_t);
-
+ return res;
+ }
+
+ boolvec_t operator==(intvec_t const &x) const { // Element-wise equality; result element d is v[d] == x.v[d].
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] == x.v[d];
+ return res;
+ }
+ boolvec_t operator!=(intvec_t const &x) const { // Element-wise inequality.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] != x.v[d];
+ return res;
+ }
+ boolvec_t operator<(intvec_t const &x) const { // Element-wise less-than on the element type.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] < x.v[d];
+ return res;
+ }
+ boolvec_t operator<=(intvec_t const &x) const { // Element-wise less-or-equal.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] <= x.v[d];
+ return res;
+ }
+ boolvec_t operator>(intvec_t const &x) const { // Element-wise greater-than.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] > x.v[d];
+ return res;
+ }
+ boolvec_t operator>=(intvec_t const &x) const { // Element-wise greater-or-equal.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] >= x.v[d];
+ return res;
+ }
+
+ intvec_t abs() const { // Element-wise absolute value via std::abs.
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = std::abs(v[d]);
+ return res;
+ }
+
+ boolvec_t isignbit() const { // True in each lane whose element is negative (v[d] < 0).
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] < 0;
+ return res;
+ }
+
+ intvec_t max(intvec_t x) const { // Element-wise maximum of *this and x via std::max.
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = std::max(v[d], x.v[d]);
+ return res;
+ }
+
+ intvec_t min(intvec_t x) const { // Element-wise minimum of *this and x via std::min.
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = std::min(v[d], x.v[d]);
+ return res;
+ }
+};
+
+template <typename T, int N> struct realpseudovec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static int const size = N;
+ typedef real_t scalar_t;
+ typedef real_t vector_t[size];
+ static int const alignment = sizeof(real_t);
+
#ifndef VML_NO_IOSTREAM
- static char const* name()
- {
- static std::string name_;
- if (name_.empty()) {
- std::stringstream buf;
- buf << "<libm:" << N << "*" << FP::name() << ">";
- name_ = buf.str();
- }
- return name_.c_str();
+ static char const *name() { // Returns a C string of the form "<libm:N*" + FP::name() + ">" describing this vector type.
+ static std::string name_; // cache: built on the first call, reused afterwards
+ if (name_.empty()) {
+ std::stringstream buf;
+ buf << "<libm:" << N << "*" << FP::name() << ">";
+ name_ = buf.str();
}
+ return name_.c_str(); // pointer into the function-local static string
+ }
#endif
- void barrier()
- {
+ void barrier() { // Passes every element through an empty inline-asm statement as a read-write operand; presumably an optimisation barrier, per the name (confirm).
#if defined __GNUC__ && !defined __clang__ && !defined __ICC
- // GCC crashes when +X is used as constraint
-# if defined __SSE2__
- for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-# elif defined __PPC64__ // maybe also __PPC__
- for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-# elif defined __arm__
- for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-# else
-# error "Floating point barrier undefined on this architecture"
-# endif
+// GCC crashes when +X is used as constraint
+#if defined __SSE2__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+x"(v[d])); // GCC path: architecture-specific constraint letter instead of the generic "X"
+#elif defined __PPC64__ // maybe also __PPC__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
#elif defined __clang__
- for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+X"(v[d]));
#elif defined __ICC
- for (int d=0; d<size; ++d) {
- real_t tmp = v[d];
- __asm__("": "+X"(tmp));
- v[d] = tmp;
- }
+ for (int d = 0; d < size; ++d) {
+ real_t tmp = v[d]; // ICC path goes through a local copy rather than using v[d] as the operand directly
+ __asm__("" : "+X"(tmp));
+ v[d] = tmp;
+ }
#elif defined __IBMCPP__
- for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+f"(v[d]));
#else
-# error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
#endif
- }
-
- typedef boolpseudovec<real_t, size> boolvec_t;
- typedef intpseudovec<real_t, size> intvec_t;
- typedef realpseudovec realvec_t;
-
- private:
- boolvec_t mapb(bool f(real_t)) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- intvec_t map(int_t f(real_t)) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t)) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, int_t), intvec_t x) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, real_t), realvec_t x) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]);
- return res;
- }
- realvec_t map(real_t f(real_t, real_t, real_t),
- realvec_t x, realvec_t y) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]);
- return res;
- }
- public:
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realpseudovec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realpseudovec(realpseudovec const& x): v(x.v) {}
- // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
- realpseudovec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
- realpseudovec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-
- real_t operator[](int n) const { return v[n]; }
- realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loadu(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = p[d];
- return res;
- }
- static realvec_t loadu(real_t const* p, size_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- return m.m.ifthen(loada(p), *this);
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- return m.m.ifthen(loadu(p), *this);
- }
- realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
- {
- return m.m.ifthen(loadu(p, ioff), *this);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p);
- }
- void storeu(real_t* p) const
- {
- for (int d=0; d<size; ++d) p[d] = v[d];
- }
- void storeu(real_t* p, size_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p, m);
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
- }
- void storeu(real_t* p, size_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
- return res;
- }
- intvec_t convert_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::convert_int(v[d]);
- return res;
- }
-
-
-
- realvec_t operator+() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = + v[d];
- return res;
- }
- realvec_t operator-() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = - v[d];
- return res;
- }
-
- realvec_t& operator+=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] += x.v[d];
- return *this;
- }
- realvec_t& operator-=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] -= x.v[d];
- return *this;
- }
- realvec_t& operator*=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] *= x.v[d];
- return *this;
- }
- realvec_t& operator/=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] /= x.v[d];
- return *this;
- }
-
- realvec_t operator+(realvec_t x) const
- {
- realvec_t res = *this;
- return res += x;
- }
- realvec_t operator-(realvec_t x) const
- {
- realvec_t res = *this;
- return res -= x;
- }
- realvec_t operator*(realvec_t x) const
- {
- realvec_t res = *this;
- return res *= x;
- }
- realvec_t operator/(realvec_t x) const
- {
- realvec_t res = *this;
- return res /= x;
- }
-
- real_t maxval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
- return res;
- }
- real_t minval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
- return res;
- }
- real_t prod() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res *= v[d];
- return res;
- }
- real_t sum() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res += v[d];
- return res;
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
- return res;
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
- return res;
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
- return res;
- }
- boolvec_t operator>=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
- return res;
- }
-
-
-
- realvec_t acos() const { return map(vml_std::acos); }
- realvec_t acosh() const { return map(vml_std::acosh); }
- realvec_t asin() const { return map(vml_std::asin); }
- realvec_t asinh() const { return map(vml_std::asinh); }
- realvec_t atan() const { return map(vml_std::atan); }
- realvec_t atan2(realvec_t y) const
- {
- return MF::vml_atan2(*this, y);
- }
- realvec_t atanh() const { return map(vml_std::atanh); }
- realvec_t cbrt() const { return map(vml_std::cbrt); }
- realvec_t ceil() const { return map(vml_std::ceil); }
- realvec_t copysign(realvec_t y) const
- {
- return map(vml_std::copysign, y);
- }
- realvec_t cos() const { return map(vml_std::cos); }
- realvec_t cosh() const { return map(vml_std::cosh); }
- realvec_t exp() const { return map(vml_std::exp); }
- realvec_t exp10() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = vml_std::exp(R(M_LN10) * v[d]);
- return res;
- }
- realvec_t exp2() const { return map(vml_std::exp2); }
- realvec_t expm1() const { return map(vml_std::expm1); }
- realvec_t fabs() const { return map(vml_std::fabs); }
- realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); }
- realvec_t floor() const { return map(vml_std::floor); }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return map(vml_std::fma, y, z);
- }
- realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); }
- realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
- realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); }
- realvec_t frexp(intvec_t* ires) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) {
- int iri;
- real_t r = vml_std::frexp(v[d], &iri);
- int_t ir = iri;
+ }
+
+ typedef boolpseudovec<real_t, size> boolvec_t;
+ typedef intpseudovec<real_t, size> intvec_t;
+ typedef realpseudovec realvec_t;
+
+private:
+ boolvec_t mapb(bool f(real_t)) const { // Applies the scalar predicate f to every element and collects the results in a boolvec_t.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]);
+ return res;
+ }
+ intvec_t map(int_t f(real_t)) const { // Applies a real -> int scalar function to every element.
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t)) const { // Applies a unary real scalar function to every element.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, int_t), intvec_t x) const { // Binary form: element d of *this is paired with integer element x.v[d].
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, real_t), realvec_t x) const { // Binary form: element d of *this is paired with x.v[d].
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d]);
+ return res;
+ }
+ realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x, // Ternary form: f(v[d], x.v[d], y.v[d]) for every d.
+ realvec_t y) const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = f(v[d], x.v[d], y.v[d]);
+ return res;
+ }
+
+public:
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realpseudovec() {} // Default constructor: the body is empty, so v is not assigned here.
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realpseudovec(realpseudovec const& x): v(x.v) {}
+ // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; }
+ realpseudovec(real_t a) { // Broadcast: every element is set to the scalar a.
+ for (int d = 0; d < size; ++d)
+ v[d] = a;
+ }
+ realpseudovec(real_t const *as) { // Copies `size` consecutive values starting at as[0].
+ for (int d = 0; d < size; ++d)
+ v[d] = as[d];
+ }
+
+ real_t operator[](int n) const { return v[n]; } // Read element n by value; n is not range-checked here.
+ realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } // Write element n, then return *this; n is not range-checked here.
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) { // "Aligned" load: asserts p is a multiple of `alignment`, then does the same work as loadu(p).
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return loadu(p);
+ }
+ static realvec_t loadu(real_t const *p) { // Reads p[0] .. p[size-1] into a new vector.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = p[d];
+ return res;
+ }
+ static realvec_t loadu(real_t const *p, size_t ioff) { // Loads from p + ioff.
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the assertion is on the base pointer p, not on p + ioff
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const { // Masked load: result is m.m.ifthen(loaded vector, *this).
+ return m.m.ifthen(loada(p), *this); // note: loada(p) itself reads all `size` elements regardless of the mask
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const { // Masked form of loadu(p).
+ return m.m.ifthen(loadu(p), *this); // the inner load reads all `size` elements regardless of the mask
+ }
+ realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const { // Masked form of loadu(p, ioff).
+ return m.m.ifthen(loadu(p, ioff), *this);
+ }
+
+ void storea(real_t *p) const { // "Aligned" store: asserts p is a multiple of `alignment`, then does the same work as storeu(p).
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storeu(p);
+ }
+ void storeu(real_t *p) const { // Writes all `size` elements to p[0] .. p[size-1].
+ for (int d = 0; d < size; ++d)
+ p[d] = v[d];
+ }
+ void storeu(real_t *p, size_t ioff) const { // Stores to p + ioff.
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the assertion is on the base pointer p, not on p + ioff
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const { // Masked store with the alignment assertion on p.
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storeu(p, m);
+ }
+ void storeu(real_t *p, mask_t const &m) const { // Masked store: p[d] is written only where m.m[d] is true.
+ for (int d = 0; d < size; ++d)
+ if (m.m[d])
+ p[d] = v[d]; // memory in unselected lanes is not touched
+ }
+ void storeu(real_t *p, size_t ioff, mask_t const &m) const { // Masked store to p + ioff.
+ VML_ASSERT(intptr_t(p) % alignment == 0); // assertion is on the base pointer p, not on p + ioff
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { // Applies FP::as_int to every element (presumably a bit-pattern reinterpretation, per the name - confirm in floatprops).
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::as_int(v[d]);
+ return res;
+ }
+ intvec_t convert_int() const { // Applies FP::convert_int to every element (presumably a value conversion, per the name - confirm in floatprops).
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::convert_int(v[d]);
+ return res;
+ }
+
+ realvec_t operator+() const { // Unary plus: returns a copy built element by element.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = +v[d];
+ return res;
+ }
+ realvec_t operator-() const { // Unary minus: negates every element.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = -v[d];
+ return res;
+ }
+
+ realvec_t &operator+=(realvec_t const &x) { // In-place element-wise addition; returns *this.
+ for (int d = 0; d < size; ++d)
+ v[d] += x.v[d];
+ return *this;
+ }
+ realvec_t &operator-=(realvec_t const &x) { // In-place element-wise subtraction; returns *this.
+ for (int d = 0; d < size; ++d)
+ v[d] -= x.v[d];
+ return *this;
+ }
+ realvec_t &operator*=(realvec_t const &x) { // In-place element-wise multiplication; returns *this.
+ for (int d = 0; d < size; ++d)
+ v[d] *= x.v[d];
+ return *this;
+ }
+ realvec_t &operator/=(realvec_t const &x) { // In-place element-wise division; returns *this.
+ for (int d = 0; d < size; ++d)
+ v[d] /= x.v[d];
+ return *this;
+ }
+
+ realvec_t operator+(realvec_t x) const { // Element-wise sum; implemented as copy of *this followed by +=.
+ realvec_t res = *this;
+ return res += x;
+ }
+ realvec_t operator-(realvec_t x) const { // Element-wise difference; copy of *this followed by -=.
+ realvec_t res = *this;
+ return res -= x;
+ }
+ realvec_t operator*(realvec_t x) const { // Element-wise product; copy of *this followed by *=.
+ realvec_t res = *this;
+ return res *= x;
+ }
+ realvec_t operator/(realvec_t x) const { // Element-wise quotient; copy of *this followed by /=.
+ realvec_t res = *this;
+ return res /= x;
+ }
+
+ real_t maxval() const { // Horizontal reduction: folds vml_std::fmax over all elements, seeded with v[0].
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = vml_std::fmax(res, v[d]);
+ return res;
+ }
+ real_t minval() const { // Horizontal reduction: folds vml_std::fmin over all elements, seeded with v[0].
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = vml_std::fmin(res, v[d]);
+ return res;
+ }
+ real_t prod() const { // Product of all elements, multiplied in index order starting from v[0].
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res *= v[d];
+ return res;
+ }
+ real_t sum() const { // Sum of all elements, added in index order starting from v[0].
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res += v[d];
+ return res;
+ }
+
+ boolvec_t operator==(realvec_t const &x) const { // Element-wise equality using the scalar == of real_t.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] == x.v[d];
+ return res;
+ }
+ boolvec_t operator!=(realvec_t const &x) const { // Element-wise inequality.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] != x.v[d];
+ return res;
+ }
+ boolvec_t operator<(realvec_t const &x) const { // Element-wise less-than.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] < x.v[d];
+ return res;
+ }
+ boolvec_t operator<=(realvec_t const &x) const { // Element-wise less-or-equal.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] <= x.v[d];
+ return res;
+ }
+ boolvec_t operator>(realvec_t const &x) const { // Element-wise greater-than.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] > x.v[d];
+ return res;
+ }
+ boolvec_t operator>=(realvec_t const &x) const { // Element-wise greater-or-equal.
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] >= x.v[d];
+ return res;
+ }
+
+ realvec_t acos() const { return map(vml_std::acos); } // This run of one-liners forwards each element to the scalar vml_std function of the same name via map().
+ realvec_t acosh() const { return map(vml_std::acosh); }
+ realvec_t asin() const { return map(vml_std::asin); }
+ realvec_t asinh() const { return map(vml_std::asinh); }
+ realvec_t atan() const { return map(vml_std::atan); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // exception: delegates to the mathfuncs vector implementation instead of a scalar map
+ realvec_t atanh() const { return map(vml_std::atanh); }
+ realvec_t cbrt() const { return map(vml_std::cbrt); }
+ realvec_t ceil() const { return map(vml_std::ceil); }
+ realvec_t copysign(realvec_t y) const { return map(vml_std::copysign, y); }
+ realvec_t cos() const { return map(vml_std::cos); }
+ realvec_t cosh() const { return map(vml_std::cosh); }
+ realvec_t exp() const { return map(vml_std::exp); }
+ realvec_t exp10() const { // Computed per element as exp(M_LN10 * x) rather than through a dedicated scalar exp10.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = vml_std::exp(R(M_LN10) * v[d]);
+ return res;
+ }
+ realvec_t exp2() const { return map(vml_std::exp2); }
+ realvec_t expm1() const { return map(vml_std::expm1); }
+ realvec_t fabs() const { return map(vml_std::fabs); }
+ realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); }
+ realvec_t floor() const { return map(vml_std::floor); }
+ realvec_t fma(realvec_t y, realvec_t z) const { // Per element: vml_std::fma(v[d], y.v[d], z.v[d]).
+ return map(vml_std::fma, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); }
+ realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); }
+ realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); }
+ realvec_t frexp(intvec_t *ires) const { // Per-element vml_std::frexp: returns the fraction parts and writes the exponents into *ires.
+ realvec_t res;
+ for (int d = 0; d < size; ++d) {
+ int iri; // scalar frexp reports the exponent as a plain int
+ real_t r = vml_std::frexp(v[d], &iri);
+ int_t ir = iri; // converted to the vector's integer element type
#if defined VML_HAVE_INF
- if (vml_std::isinf(v[d])) ir = std::numeric_limits<int_t>::max();
+ if (vml_std::isinf(v[d]))
+ ir = std::numeric_limits<int_t>::max(); // infinite input: exponent is overridden with the largest int_t
#endif
#if defined VML_HAVE_NAN
- if (vml_std::isnan(v[d])) ir = std::numeric_limits<int_t>::min();
+ if (vml_std::isnan(v[d]))
+ ir = std::numeric_limits<int_t>::min(); // NaN input: exponent is overridden with the smallest int_t
#endif
- res.v[d] = r;
- ires->v[d] = ir;
- }
- return res;
+ res.v[d] = r;
+ ires->v[d] = ir; // ires is dereferenced unconditionally, so it must not be null
}
- realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); }
- intvec_t ilogb() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) {
- int_t r = vml_std::ilogb(v[d]);
- typedef std::numeric_limits<int_t> NL;
- if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) {
- r = NL::min();
+ return res;
+ }
+ realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); } // Per element: vml_std::hypot(v[d], y.v[d]).
+ intvec_t ilogb() const { // Per-element vml_std::ilogb, with the special-case results remapped onto the limits of int_t.
+ intvec_t res;
+ for (int d = 0; d < size; ++d) {
+ int_t r = vml_std::ilogb(v[d]);
+ typedef std::numeric_limits<int_t> NL;
+ if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) { // zero input: report NL::min() when the platform's FP_ILOGB0 differs from it
+ r = NL::min();
#if defined VML_HAVE_INF
- } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) {
- r = NL::max();
+ } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) { // infinite input: report NL::max() when int_t's maximum is not INT_MAX
+ r = NL::max();
#endif
#if defined VML_HAVE_NAN
- } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) {
- r = NL::min();
+ } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) { // NaN input: report NL::min() when FP_ILOGBNAN differs from it
+ r = NL::min();
#endif
- }
- res.v[d] = r;
}
- return res;
+ res.v[d] = r;
}
- boolvec_t isfinite() const { return mapb(vml_std::isfinite); }
- boolvec_t isinf() const { return mapb(vml_std::isinf); }
- boolvec_t isnan() const { return mapb(vml_std::isnan); }
- boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
- realvec_t ldexp(int_t n) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n);
- return res;
- }
- realvec_t ldexp(intvec_t n) const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n.v[d]);
- return res;
- }
- realvec_t log() const { return map(vml_std::log); }
- realvec_t log10() const { return map(vml_std::log10); }
- realvec_t log1p() const { return map(vml_std::log1p); }
- realvec_t log2() const { return map(vml_std::log2); }
- intvec_t lrint() const
- {
- realvec_t res;
- if (sizeof(int_t) <= sizeof(long)) {
- for (int d=0; d<size; ++d) res.v[d] = vml_std::lrint(v[d]);
- } else if (sizeof(int_t) <= sizeof(long long)) {
- for (int d=0; d<size; ++d) res.v[d] = vml_std::llrint(v[d]);
- } else {
- __builtin_unreachable();
- }
- return res;
- }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return map(vml_std::nextafter, y);
- }
- realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); }
- realvec_t rcp() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = R(1.0) / v[d];
- return res;
- }
- realvec_t remainder(realvec_t y) const
- {
- return map(vml_std::remainder, y);
- }
- realvec_t rint() const { return map(vml_std::rint); }
- realvec_t round() const { return map(vml_std::round); }
- realvec_t rsqrt() const { return sqrt().rcp(); }
- boolvec_t signbit() const { return mapb(vml_std::signbit); }
- realvec_t sin() const { return map(vml_std::sin); }
- realvec_t sinh() const { return map(vml_std::sinh); }
- realvec_t sqrt() const { return map(vml_std::sqrt); }
- realvec_t tan() const { return map(vml_std::tan); }
- realvec_t tanh() const { return map(vml_std::tanh); }
- realvec_t trunc() const { return map(vml_std::trunc); }
- };
-
-
-
- // boolpseudovec definitions
-
- template<typename T, int N>
- inline
- typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::as_int() const
- {
- return convert_int();
- }
-
- template<typename T, int N>
- inline
- typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::convert_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d];
return res;
}
-
- template<typename T, int N>
- inline
- typename boolpseudovec<T,N>::boolvec_t
- boolpseudovec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
- return res;
- }
-
- template<typename T, int N>
- inline
- typename boolpseudovec<T,N>::intvec_t
- boolpseudovec<T,N>::ifthen(intvec_t x, intvec_t y) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+ boolvec_t isfinite() const { return mapb(vml_std::isfinite); } // These four predicates apply the scalar vml_std classifier of the same name to every element via mapb().
+ boolvec_t isinf() const { return mapb(vml_std::isinf); }
+ boolvec_t isnan() const { return mapb(vml_std::isnan); }
+ boolvec_t isnormal() const { return mapb(vml_std::isnormal); }
+ realvec_t ldexp(int_t n) const { // Per element: vml_std::ldexp(v[d], n) with the same exponent n for every element.
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = vml_std::ldexp(v[d], n);
return res;
}
-
- template<typename T, int N>
- inline
- typename boolpseudovec<T,N>::realvec_t
- boolpseudovec<T,N>::ifthen(realvec_t x, realvec_t y) const
- {
+ realvec_t ldexp(intvec_t n) const { // Per element: vml_std::ldexp(v[d], n.v[d]), each element using its own exponent.
realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+ for (int d = 0; d < size; ++d)
+ res.v[d] = vml_std::ldexp(v[d], n.v[d]);
return res;
}
-
-
-
- // intpseudovec definitions
-
- template<typename T, int N>
- inline
- typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::as_float() const
- {
+ realvec_t log() const { return map(vml_std::log); } // These four forward each element to the scalar vml_std logarithm of the same name via map().
+ realvec_t log10() const { return map(vml_std::log10); }
+ realvec_t log1p() const { return map(vml_std::log1p); }
+ realvec_t log2() const { return map(vml_std::log2); }
+ intvec_t lrint() const { // Per-element vml_std::lrint / llrint (chosen by the width of int_t); returns the integer results.
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+ intvec_t res; // accumulate directly in the integer vector that is returned, instead of round-tripping the integers through real_t
+ if (sizeof(int_t) <= sizeof(long)) { // long is wide enough for int_t: use lrint
+ for (int d = 0; d < size; ++d)
+ res.v[d] = vml_std::lrint(v[d]);
+ } else if (sizeof(int_t) <= sizeof(long long)) { // otherwise fall back to the long long variant
+ for (int d = 0; d < size; ++d)
+ res.v[d] = vml_std::llrint(v[d]);
+ } else {
+ __builtin_unreachable(); // int_t wider than long long is not handled
+ }
return res;
}
-
- template<typename T, int N>
- inline
- intpseudovec<T,N> intpseudovec<T,N>::bitifthen(intvec_t x, intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- template<typename T, int N>
- inline
- typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::convert_float() const
- {
+ realvec_t mad(realvec_t y, realvec_t z) const { // Delegates to the mathfuncs vector implementation MF::vml_mad(*this, y, z) rather than a scalar map.
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return map(vml_std::nextafter, y); } // Per element: vml_std::nextafter(v[d], y.v[d]).
+ realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); } // Per element: vml_std::pow(v[d], y.v[d]).
+ realvec_t rcp() const { // Reciprocal of every element, computed with an ordinary division.
realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::convert_float(v[d]);
+ for (int d = 0; d < size; ++d)
+ res.v[d] = R(1.0) / v[d]; // plain 1/x; no special handling of zero elements here
return res;
}
-
- template<typename T, int N>
- inline intpseudovec<T,N> intpseudovec<T,N>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- template<typename T, int N>
- inline intpseudovec<T,N> intpseudovec<T,N>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
-
-
- // Wrappers
-
- // boolpseudovec wrappers
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline bool all(boolpseudovec<real_t, size> x) { return x.all(); }
-
- template<typename real_t, int size>
- inline bool any(boolpseudovec<real_t, size> x) { return x.any(); }
-
- template<typename real_t, int size>
- inline
- boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
- boolpseudovec<real_t, size> x,
- boolpseudovec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
- intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
- realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
-
-
- // intpseudovec wrappers
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x)
- {
- return x.abs();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x)
- {
- return x.as_bool();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x)
- {
- return x.as_float();
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> y,
- intpseudovec<real_t, size> z)
- {
- return x.bitifthen(y, z);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x)
- {
- return x.clz();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x)
- {
- return x.convert_bool();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x)
- {
- return x.convert_float();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x)
- {
- return x.isignbit();
- }
-
- template<typename real_t, int size>
- inline
- intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
- typename intpseudovec<real_t, size>::int_t n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> y)
- {
- return x.max(y);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> y)
- {
- return x.min(y);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x)
- {
- return x.popcount();
- }
-
- template<typename real_t, int size>
- inline
- intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
- typename
- intpseudovec<real_t, size>::int_t n)
- {
- return x.rotate(n);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
- intpseudovec<real_t, size> n)
- {
- return x.rotate(n);
- }
-
-
-
- // realpseudovec wrappers
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size>
- loada(real_t const* p,
- realpseudovec<real_t, size> x,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.loada(p, m);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size>
- loadu(real_t const* p,
- realpseudovec<real_t, size> x,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, m);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size>
- loadu(real_t const* p, size_t ioff,
- realpseudovec<real_t, size> x,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, ioff, m);
- }
-
- template<typename real_t, int size>
- inline void storea(realpseudovec<real_t, size> x, real_t* p)
- {
- return x.storea(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realpseudovec<real_t, size> x, real_t* p)
- {
- return x.storeu(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff)
- {
- return x.storeu(p, ioff);
- }
-
- template<typename real_t, int size>
- inline void storea(realpseudovec<real_t, size> x, real_t* p,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.storea(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realpseudovec<real_t, size> x, real_t* p,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff,
- typename realpseudovec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, ioff, m);
- }
-
-
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline real_t maxval(realpseudovec<real_t, size> x)
- {
- return x.maxval();
- }
-
- template<typename real_t, int size>
- inline real_t minval(realpseudovec<real_t, size> x)
- {
- return x.minval();
- }
-
- template<typename real_t, int size>
- inline real_t prod(realpseudovec<real_t, size> x)
- {
- return x.prod();
- }
-
- template<typename real_t, int size>
- inline real_t sum(realpseudovec<real_t, size> x)
- {
- return x.sum();
- }
-
-
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x)
- {
- return x.acos();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x)
- {
- return x.acosh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x)
- {
- return x.asin();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x)
- {
- return x.asinh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x)
- {
- return x.atan();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.atan2(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x)
- {
- return x.atanh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x)
- {
- return x.cbrt();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x)
- {
- return x.ceil();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.copysign(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x)
- {
- return x.cos();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x)
- {
- return x.cosh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x)
- {
- return x.exp();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x)
- {
- return x.exp10();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x)
- {
- return x.exp2();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x)
- {
- return x.expm1();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x)
- {
- return x.fabs();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x)
- {
- return x.floor();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.fdim(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y,
- realpseudovec<real_t, size> z)
- {
- return x.fma(y, z);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.fmax(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.fmin(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.fmod(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
- intpseudovec<real_t, size>* r)
- {
- return x.frexp(r);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.hypot(y);
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x)
- {
- return x.ilogb();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x)
- {
- return x.isfinite();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x)
- {
- return x.isinf();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x)
- {
- return x.isnan();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x)
- {
- return x.isnormal();
- }
-
- template<typename real_t, int size>
- inline
- realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
- typename intpseudovec<real_t, size>::int_t
- n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline
- realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
- intpseudovec<real_t, size> n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x)
- {
- return x.log();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x)
- {
- return x.log10();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x)
- {
- return x.log1p();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x)
- {
- return x.log2();
- }
-
- template<typename real_t, int size>
- inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x)
- {
- return x.lrint();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y,
- realpseudovec<real_t, size> z)
- {
- return x.mad(y, z);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.nextafter(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.pow(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x)
- {
- return x.rcp();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
- realpseudovec<real_t, size> y)
- {
- return x.remainder(y);
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x)
- {
- return x.rint();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x)
- {
- return x.round();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x)
- {
- return x.rsqrt();
- }
-
- template<typename real_t, int size>
- inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x)
- {
- return x.signbit();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x)
- {
- return x.sin();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x)
- {
- return x.sinh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x)
- {
- return x.sqrt();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x)
- {
- return x.tan();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x)
- {
- return x.tanh();
- }
-
- template<typename real_t, int size>
- inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x)
- {
- return x.trunc();
- }
-
-
-
+ realvec_t remainder(realvec_t y) const { return map(vml_std::remainder, y); }
+ realvec_t rint() const { return map(vml_std::rint); }
+ realvec_t round() const { return map(vml_std::round); }
+ realvec_t rsqrt() const { return sqrt().rcp(); }
+ boolvec_t signbit() const { return mapb(vml_std::signbit); }
+ realvec_t sin() const { return map(vml_std::sin); }
+ realvec_t sinh() const { return map(vml_std::sinh); }
+ realvec_t sqrt() const { return map(vml_std::sqrt); }
+ realvec_t tan() const { return map(vml_std::tan); }
+ realvec_t tanh() const { return map(vml_std::tanh); }
+ realvec_t trunc() const { return map(vml_std::trunc); }
+};
+
+// boolpseudovec definitions
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::as_int() const {
+ return convert_int();
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::convert_int() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::boolvec_t
+boolpseudovec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::intvec_t
+boolpseudovec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename boolpseudovec<T, N>::realvec_t
+boolpseudovec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+// intpseudovec definitions
+
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::as_float() const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::as_float(v[d]);
+ return res;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+template <typename T, int N>
+inline typename intpseudovec<T, N>::realvec_t
+intpseudovec<T, N>::convert_float() const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::convert_float(v[d]);
+ return res;
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+template <typename T, int N>
+inline intpseudovec<T, N> intpseudovec<T, N>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+// Wrappers
+
+// boolpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(boolpseudovec<real_t, size> x) {
+ return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(boolpseudovec<real_t, size> x) {
+ return x.any();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+ boolpseudovec<real_t, size> x,
+ boolpseudovec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+ intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c,
+ realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+// intpseudovec wrappers
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x) {
+ return x.abs();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x) {
+ return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x) {
+ return x.as_float();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> y,
+ intpseudovec<real_t, size> z) {
+ return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x) {
+ return x.clz();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x) {
+ return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x) {
+ return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x) {
+ return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+lsr(intpseudovec<real_t, size> x,
+ typename intpseudovec<real_t, size>::int_t n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> y) {
+ return x.max(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> y) {
+ return x.min(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x) {
+ return x.popcount();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size>
+rotate(intpseudovec<real_t, size> x,
+ typename intpseudovec<real_t, size>::int_t n) {
+ return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> n) {
+ return x.rotate(n);
+}
+
+// realpseudovec wrappers
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loada(real_t const *p, realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+loadu(real_t const *p, size_t ioff, realpseudovec<real_t, size> x,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p) {
+ return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p) {
+ return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff) {
+ return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realpseudovec<real_t, size> x, real_t *p,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff,
+ typename realpseudovec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realpseudovec<real_t, size> x) {
+ return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realpseudovec<real_t, size> x) {
+ return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realpseudovec<real_t, size> x) {
+ return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realpseudovec<real_t, size> x) {
+ return x.sum();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x) {
+ return x.acos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x) {
+ return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x) {
+ return x.asin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x) {
+ return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x) {
+ return x.atan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x) {
+ return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x) {
+ return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x) {
+ return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x) {
+ return x.cos();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x) {
+ return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x) {
+ return x.exp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x) {
+ return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x) {
+ return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x) {
+ return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x) {
+ return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x) {
+ return x.floor();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y,
+ realpseudovec<real_t, size> z) {
+ return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> *r) {
+ return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x) {
+ return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x) {
+ return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x) {
+ return x.isinf();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x) {
+ return x.isnan();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x) {
+ return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size>
+ldexp(realpseudovec<real_t, size> x,
+ typename intpseudovec<real_t, size>::int_t n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x,
+ intpseudovec<real_t, size> n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x) {
+ return x.log();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x) {
+ return x.log10();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x) {
+ return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x) {
+ return x.log2();
+}
+
+template <typename real_t, int size>
+inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x) {
+ return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y,
+ realpseudovec<real_t, size> z) {
+ return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x) {
+ return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x,
+ realpseudovec<real_t, size> y) {
+ return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x) {
+ return x.rint();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x) {
+ return x.round();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x) {
+ return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x) {
+ return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x) {
+ return x.sin();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x) {
+ return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x) {
+ return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x) {
+ return x.tan();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x) {
+ return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x) {
+ return x.trunc();
+}
+
#ifndef VML_NO_IOSTREAM
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- boolpseudovec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- intpseudovec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- realpseudovec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ boolpseudovec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ intpseudovec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os,
+ realpseudovec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
#endif
-
+
} // namespace vecmathlib
-#endif // #ifndef VEC_PSEUDO_H
+#endif // #ifndef VEC_PSEUDO_H
diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h
index 9fa6bd0..b88b0da 100644
--- a/vec_qpx_double4.h
+++ b/vec_qpx_double4.h
@@ -11,785 +11,662 @@
// QPX intrinsics
#ifdef __clang__
-# include <qpxintrin.h>
+#include <qpxintrin.h>
#else
-# include <builtins.h>
+#include <builtins.h>
#endif
#include <mass_simd.h>
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_4
- template<> struct boolvec<double,4>;
- template<> struct intvec<double,4>;
- template<> struct realvec<double,4>;
-
-
-
- template<>
- struct boolvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef bool scalar_t;
- typedef vector4double bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // canonical true is +1.0, canonical false is -1.0
- // >=0 is true, -0 is true, nan is false
- static real_t from_bool(bool a) { return a ? +1.0 : -1.0; }
- static bool to_bool(real_t a) { return a>=0.0; }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(vec_splats(from_bool(a))) {}
- boolvec(const bool* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(v[n]);
- }
- boolvec& set_elt(int n, bool a)
- {
- return v[n]=from_bool(a), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return vec_not(v); }
-
- boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
- boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
- boolvec operator==(boolvec x) const
- {
- return vec_logical(v, x.v, 0x9);
- }
- boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-
- bool all() const
- {
- // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
- boolvec x0123 = *this;
- boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
- boolvec y0022 = x0123 && x1032;
- return y0022[0] && y0022[2];
- }
- bool any() const
- {
- // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
- boolvec x0123 = *this;
- boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
- boolvec y0022 = x0123 || x1032;
- return y0022[0] || y0022[2];
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef int_t scalar_t;
- typedef vector4double ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(const intvec& x): v(x.v) {}
- // intvec& operator=(const intvec& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(vec_splats(FP::as_float(a))) {}
- intvec(const int_t* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
- static intvec iota()
- {
- const int_t iota_[] = {0, 1, 2, 3};
- return intvec(iota_);
- }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return FP::as_int(v[n]);
- }
- intvec& set_elt(int n, int_t a)
- {
- return v[n]=FP::as_float(a), *this;
- }
-
-
-
- // Vector casts do not change the bit battern
- boolvec_t as_bool() const { return v; }
- boolvec_t convert_bool() const { return *this != IV(I(0)); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec operator+() const { return *this; }
- intvec operator-() const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]);
- return r;
- }
-
- intvec operator+(intvec x) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] + x[d]);
- return r;
- }
- intvec operator-(intvec x) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]);
- return r;
- }
-
- intvec& operator+=(intvec x) { return *this=*this+x; }
- intvec& operator-=(intvec x) { return *this=*this-x; }
-
-
-
- intvec operator~() const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]);
- return r;
- }
-
- intvec operator&(intvec x) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]);
- return r;
- }
- intvec operator|(intvec x) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]);
- return r;
- }
- intvec operator^(intvec x) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]);
- return r;
- }
-
- intvec& operator&=(intvec x) { return *this=*this&x; }
- intvec& operator|=(intvec x) { return *this=*this|x; }
- intvec& operator^=(intvec x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const
- {
- intvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n));
- return r;
- }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n);
- return r;
- }
- intvec operator<<(int_t n) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n);
- return r;
- }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- intvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d]));
- return r;
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]);
- return r;
- }
- intvec operator<<(intvec n) const
- {
- intvec r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]);
- return r;
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]);
- return r;
- }
- boolvec_t operator!=(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]);
- return r;
- }
- boolvec_t operator<(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]);
- return r;
- }
- boolvec_t operator<=(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]);
- return r;
- }
- boolvec_t operator>(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]);
- return r;
- }
- boolvec_t operator>=(intvec x) const
- {
- boolvec_t r;
- for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]);
- return r;
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const;
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<double,4>: floatprops<double>
- {
- static int const size = 4;
- typedef real_t scalar_t;
- typedef vector4double vector_t;
- static int const alignment = sizeof(vector_t);
-
- static const char* name() { return "<QPX:4*double>"; }
- void barrier() { __asm__("": "+v"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(const realvec& x): v(x.v) {}
- // realvec& operator=(const realvec& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(vec_splats(a)) {}
- realvec(const real_t* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return v[n];
- }
- realvec& set_elt(int n, real_t a)
- {
- return v[n]=a, *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(const real_t* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return vec_lda(0, (real_t*)p);
- }
- static realvec_t loadu(const real_t* p)
- {
- realvec_t v0 = vec_ld(0, (real_t*)p);
- realvec_t v1 = vec_ld(31, (real_t*)p);
- return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p));
- }
- static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- // TODO: use load instruction with fixed offset
- return loadu(p+ioff);
- }
- realvec_t loada(const real_t* p, mask_t m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(const real_t* p, mask_t m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, mask_t m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- // TODO: use load instruction with fixed offset
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- vec_sta(v, 0, p);
- }
- void storeu(real_t* p) const
- {
- // Vector stores would require vector loads, which would need to
- // be atomic
- // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
- p[0] = (*this)[0];
- p[1] = (*this)[1];
- p[2] = (*this)[2];
- p[3] = (*this)[3];
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
- }
- void storeu(real_t* p, mask_t m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return v; }
- intvec_t convert_int() const { return vec_ctidz(v); }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const { return vec_neg(v); }
-
- realvec operator+(realvec x) const { return vec_add(v, x.v); }
- realvec operator-(realvec x) const { return vec_sub(v, x.v); }
- realvec operator*(realvec x) const { return vec_mul(v, x.v); }
- realvec operator/(realvec x) const
- {
- // return vec_swdiv_nochk(v, x.v);
- return div_fastd4(v, x.v);
- }
-
- realvec& operator+=(realvec x) { return *this=*this+x; }
- realvec& operator-=(realvec x) { return *this=*this-x; }
- realvec& operator*=(realvec x) { return *this=*this*x; }
- realvec& operator/=(realvec x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
- // vml_std::fmax((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
- realvec_t y0022 = x0123.fmax(x1032);
- return vml_std::fmax(y0022[0], y0022[2]);
+template <> struct boolvec<double, 4>;
+template <> struct intvec<double, 4>;
+template <> struct realvec<double, 4>;
+
+template <> struct boolvec<double, 4> : floatprops<double> { // 4-lane boolean vector stored in a vector4double
+ static int const size = 4; // number of lanes
+ typedef bool scalar_t;
+ typedef vector4double bvector_t; // booleans are held as doubles, one per lane
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // canonical true is +1.0, canonical false is -1.0
+ // >=0 is true, -0 is true, nan is false
+ static real_t from_bool(bool a) { return a ? +1.0 : -1.0; } // bool -> canonical lane value
+ static bool to_bool(real_t a) { return a >= 0.0; } // lane value -> bool
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v; // underlying vector value
+
+ boolvec() {} // leaves v uninitialised
+ boolvec(bvector_t x) : v(x) {} // wraps an existing vector value
+ boolvec(bool a) : v(vec_splats(from_bool(a))) {} // built from the canonical encoding of a via vec_splats
+ boolvec(const bool *as) { // reads as[0..size-1], one element per lane
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return to_bool(v[n]); } // decode lane n
+ boolvec &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; } // encode a into lane n; returns *this
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return vec_not(v); }
+
+ boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+ boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+ boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); } // NOTE(review): 0x9 presumably encodes the XNOR truth table -- confirm against the QPX docs
+ boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+ bool all() const { // vectorised form of the scalar expression in the comment below
+ // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+ boolvec x0123 = *this;
+ boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); // 01032 is an octal literal; going by the variable name it yields lanes 1,0,3,2 -- confirm
+ boolvec y0022 = x0123 && x1032; // combine each lane with its permuted partner
+ return y0022[0] && y0022[2]; // only lanes 0 and 2 are read for the final result
+ }
+ bool any() const { // vectorised form of the scalar expression in the comment below
+ // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+ boolvec x0123 = *this;
+ boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); // same permutation as in all()
+ boolvec y0022 = x0123 || x1032; // combine each lane with its permuted partner
+ return y0022[0] || y0022[2]; // only lanes 0 and 2 are read for the final result
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const; // declared only; no definition in this struct
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 4> : floatprops<double> { // 4-lane integer vector stored in a vector4double
+ static int const size = 4; // number of lanes
+ typedef int_t scalar_t;
+ typedef vector4double ivector_t; // integers are carried inside double lanes (see operator[] / set_elt)
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v; // underlying vector value
+
+ intvec() {} // leaves v uninitialised
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(const intvec& x): v(x.v) {}
+ // intvec& operator=(const intvec& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {} // wraps an existing vector value
+ intvec(int_t a) : v(vec_splats(FP::as_float(a))) {} // built from FP::as_float(a) via vec_splats
+ intvec(const int_t *as) { // reads as[0..size-1], one element per lane
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+ static intvec iota() { // returns the vector {0, 1, 2, 3}
+ const int_t iota_[] = {0, 1, 2, 3};
+ return intvec(iota_);
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return FP::as_int(v[n]); } // lane n passed through FP::as_int (presumably a bit reinterpretation -- confirm in floatprops)
+ intvec &set_elt(int n, int_t a) { return v[n] = FP::as_float(a), *this; } // stores FP::as_float(a) into lane n; returns *this
+
+ // Vector casts do not change the bit pattern
+ boolvec_t as_bool() const { return v; }
+ boolvec_t convert_bool() const { return *this != IV(I(0)); } // lane is true where the integer is non-zero
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { // lane-by-lane scalar negation
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, -(*this)[d]);
+ return r;
+ }
+
+ intvec operator+(intvec x) const { // lane-by-lane scalar addition
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] + x[d]);
+ return r;
+ }
+ intvec operator-(intvec x) const { // lane-by-lane scalar subtraction
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] - x[d]);
+ return r;
+ }
+
+ intvec &operator+=(intvec x) { return *this = *this + x; }
+ intvec &operator-=(intvec x) { return *this = *this - x; }
+
+ intvec operator~() const { // lane-by-lane bitwise complement
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, ~(*this)[d]);
+ return r;
+ }
+
+ intvec operator&(intvec x) const { // lane-by-lane bitwise AND
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] & x[d]);
+ return r;
+ }
+ intvec operator|(intvec x) const { // lane-by-lane bitwise OR
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] | x[d]);
+ return r;
+ }
+ intvec operator^(intvec x) const { // lane-by-lane bitwise XOR
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] ^ x[d]);
+ return r;
+ }
+
+ intvec &operator&=(intvec x) { return *this = *this & x; }
+ intvec &operator|=(intvec x) { return *this = *this | x; }
+ intvec &operator^=(intvec x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const; // declared only; no definition in this struct
+
+ intvec_t lsr(int_t n) const { // right shift carried out on U (uint_t) values, same count n for every lane
+ intvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, U((*this)[d]) >> U(n));
+ return r;
+ }
+ intvec_t rotate(int_t n) const; // declared only; no definition in this struct
+ intvec operator>>(int_t n) const { // right shift on the int_t value of each lane, same count n
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] >> n);
+ return r;
+ }
+ intvec operator<<(int_t n) const { // left shift of each lane by the same count n
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] << n);
+ return r;
+ }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const { // as lsr(int_t), but lane d is shifted by n[d]
+ intvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, U((*this)[d]) >> U(n[d]));
+ return r;
+ }
+ intvec_t rotate(intvec_t n) const; // declared only; no definition in this struct
+ intvec operator>>(intvec n) const { // lane d is shifted right by n[d]
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] >> n[d]);
+ return r;
+ }
+ intvec operator<<(intvec n) const { // lane d is shifted left by n[d]
+ intvec r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] << n[d]);
+ return r;
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const; // declared only; no definition in this struct
+ intvec_t popcount() const; // declared only; no definition in this struct
+
+ boolvec_t operator==(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] == x[d]);
+ return r;
+ }
+ boolvec_t operator!=(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] != x[d]);
+ return r;
+ }
+ boolvec_t operator<(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] < x[d]);
+ return r;
+ }
+ boolvec_t operator<=(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] <= x[d]);
+ return r;
+ }
+ boolvec_t operator>(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] > x[d]);
+ return r;
+ }
+ boolvec_t operator>=(intvec x) const { // lane-by-lane comparison of the int_t values
+ boolvec_t r;
+ for (int d = 0; d < size; ++d)
+ r.set_elt(d, (*this)[d] >= x[d]);
+ return r;
+ }
+
+ intvec_t abs() const; // declared only; no definition in this struct
+ boolvec_t isignbit() const; // declared only; no definition in this struct
+ intvec_t max(intvec_t x) const; // declared only; no definition in this struct
+ intvec_t min(intvec_t x) const; // declared only; no definition in this struct
+};
+
+template <> struct realvec<double, 4> : floatprops<double> {
+ static int const size = 4;
+ typedef real_t scalar_t;
+ typedef vector4double vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static const char *name() { return "<QPX:4*double>"; }
+ void barrier() { __asm__("" : "+v"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(const realvec& x): v(x.v) {}
+ // realvec& operator=(const realvec& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(vec_splats(a)) {}
+ realvec(const real_t *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return v[n]; }
+ realvec &set_elt(int n, real_t a) { return v[n] = a, *this; }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(const real_t *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return vec_lda(0, (real_t *)p);
+ }
+ static realvec_t loadu(const real_t *p) {
+ realvec_t v0 = vec_ld(0, (real_t *)p);
+ realvec_t v1 = vec_ld(31, (real_t *)p);
+ return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t *)p));
+ }
+ static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ // TODO: use load instruction with fixed offset
+ return loadu(p + ioff);
+ }
+ realvec_t loada(const real_t *p, mask_t m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- real_t minval() const
- {
- // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
- // vml_std::fmin((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
- realvec_t y0022 = x0123.fmin(x1032);
- return vml_std::fmin(y0022[0], y0022[2]);
+ }
+ realvec_t loadu(const real_t *p, mask_t m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- real_t prod() const
- {
- // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
- realvec_t x = vec_xmul(v, v);
- return x[1] * x[3];
+ }
+ realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ // TODO: use load instruction with fixed offset
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ vec_sta(v, 0, p);
+ }
+ void storeu(real_t *p) const {
+ // Vector stores would require vector loads, which would need to
+ // be atomic
+ // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+ // for good ideas
+ p[0] = (*this)[0];
+ p[1] = (*this)[1];
+ p[2] = (*this)[2];
+ p[3] = (*this)[3];
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
- real_t sum() const
- {
- // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
- realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
- realvec_t x = vec_xxmadd(v, c1, v);
- return x[0] + x[2];
+ }
+ void storeu(real_t *p, mask_t m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
-
-
-
- boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
- boolvec_t operator!=(realvec x) const { return ! (*this == x); }
- boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
- boolvec_t operator<=(realvec x) const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return v; }
+ intvec_t convert_int() const { return vec_ctidz(v); }
+
+ realvec operator+() const { return *this; }
+ realvec operator-() const { return vec_neg(v); }
+
+ realvec operator+(realvec x) const { return vec_add(v, x.v); }
+ realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+ realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+ realvec operator/(realvec x) const {
+ // return vec_swdiv_nochk(v, x.v);
+ return div_fastd4(v, x.v);
+ }
+
+ realvec &operator+=(realvec x) { return *this = *this + x; }
+ realvec &operator-=(realvec x) { return *this = *this - x; }
+ realvec &operator*=(realvec x) { return *this = *this * x; }
+ realvec &operator/=(realvec x) { return *this = *this / x; }
+
+ real_t maxval() const {
+ // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+ // vml_std::fmax((*this)[2], (*this)[3]));
+ realvec_t x0123 = *this;
+ realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+ realvec_t y0022 = x0123.fmax(x1032);
+ return vml_std::fmax(y0022[0], y0022[2]);
+ }
+ real_t minval() const {
+ // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+ // vml_std::fmin((*this)[2], (*this)[3]));
+ realvec_t x0123 = *this;
+ realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032));
+ realvec_t y0022 = x0123.fmin(x1032);
+ return vml_std::fmin(y0022[0], y0022[2]);
+ }
+ real_t prod() const {
+ // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+ realvec_t x = vec_xmul(v, v);
+ return x[1] * x[3];
+ }
+ real_t sum() const {
+ // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ realvec_t c1 = vec_logical(v, v, 0xf); // +1.0
+ realvec_t x = vec_xxmadd(v, c1, v);
+ return x[0] + x[2];
+ }
+
+ boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); }
+ boolvec_t operator!=(realvec x) const { return !(*this == x); }
+ boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); }
+ boolvec_t operator<=(realvec x) const {
#ifdef VML_HAVE_NAN
- return *this < x || *this == x;
+ return *this < x || *this == x;
#else
- return ! (*this > x);
+ return !(*this > x);
#endif
- }
- boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
- boolvec_t operator>=(realvec x) const
- {
+ }
+ boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); }
+ boolvec_t operator>=(realvec x) const {
#ifdef VML_HAVE_NAN
- return *this > x || *this == x;
+ return *this > x || *this == x;
#else
- return ! (*this < x);
+ return !(*this < x);
#endif
- }
-
-
-
- realvec acos() const { return acosd4(v); }
- realvec acosh() const { return acoshd4(v); }
- realvec asin() const { return asind4(v); }
- realvec asinh() const { return asinhd4(v); }
- realvec atan() const { return atand4(v); }
- realvec atan2(realvec y) const { return atan2d4(v, y.v); }
- realvec atanh() const { return atanhd4(v); }
- realvec cbrt() const { return cbrtd4(v); }
- realvec ceil() const { return vec_ceil(v); }
- realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
- realvec cos() const { return cosd4(v); }
- realvec cosh() const { return coshd4(v); }
- realvec exp() const { return expd4(v); }
- realvec exp10() const { return exp10d4(v); }
- realvec exp2() const { return exp2d4(v); }
- realvec expm1() const { return expm1d4(v); }
- realvec fabs() const { return vec_abs(v); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const { return vec_floor(v); }
- realvec fma(realvec y, realvec z) const
- {
- return vec_madd(v, y.v, z.v);
- }
- realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
- realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return hypotd4(v, y.v); }
- intvec_t ilogb() const
- {
- // int_t ilogb_[] = {
- // ::ilogb((*this)[0]),
- // ::ilogb((*this)[1]),
- // ::ilogb((*this)[2]),
- // ::ilogb((*this)[3])
- // };
- // return intvec_t(ilogb_);
- return MF::vml_ilogb(v);
- }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
+ }
+
+ realvec acos() const { return acosd4(v); }
+ realvec acosh() const { return acoshd4(v); }
+ realvec asin() const { return asind4(v); }
+ realvec asinh() const { return asinhd4(v); }
+ realvec atan() const { return atand4(v); }
+ realvec atan2(realvec y) const { return atan2d4(v, y.v); }
+ realvec atanh() const { return atanhd4(v); }
+ realvec cbrt() const { return cbrtd4(v); }
+ realvec ceil() const { return vec_ceil(v); }
+ realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); }
+ realvec cos() const { return cosd4(v); }
+ realvec cosh() const { return coshd4(v); }
+ realvec exp() const { return expd4(v); }
+ realvec exp10() const { return exp10d4(v); }
+ realvec exp2() const { return exp2d4(v); }
+ realvec expm1() const { return expm1d4(v); }
+ realvec fabs() const { return vec_abs(v); }
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const { return vec_floor(v); }
+ realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
+ realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); }
+ realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); }
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec hypot(realvec y) const { return hypotd4(v, y.v); }
+ intvec_t ilogb() const {
+ // int_t ilogb_[] = {
+ // ::ilogb((*this)[0]),
+ // ::ilogb((*this)[1]),
+ // ::ilogb((*this)[2]),
+ // ::ilogb((*this)[3])
+ // };
+ // return intvec_t(ilogb_);
+ return MF::vml_ilogb(v);
+ }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
#ifdef VML_HAVE_NAN
- return vec_tstnan(v, v);
+ return vec_tstnan(v, v);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); }
- realvec ldexp(intvec_t n) const
- {
- real_t ldexp_[] = {
- vml_std::ldexp((*this)[0], n[0]),
- vml_std::ldexp((*this)[1], n[1]),
- vml_std::ldexp((*this)[2], n[2]),
- vml_std::ldexp((*this)[3], n[3])
- };
- return realvec_t(ldexp_);
- }
- realvec log() const { return logd4(v); }
- realvec log10() const { return log10d4(v); }
- realvec log1p() const { return log1pd4(v); }
- realvec log2() const { return log2d4(v); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec nextafter(realvec y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec pow(realvec y) const { return powd4(v, y.v); }
- realvec rcp() const { return recip_fastd4(v); }
- realvec remainder(realvec y) const
- {
- return MF::vml_remainder(*this, y);
- }
- realvec rint() const
- {
- return MF::vml_rint(*this);
- // This is tempting, but seems too invasive
- // #ifdef VML_HAVE_FP_CONTRACT
- // return MF::vml_rint(*this);
- // #else
- // return vec_round(v); // use round instead of rint
- // #endif
- }
- realvec round() const { return vec_round(v); }
- realvec rsqrt() const
- {
- realvec x = *this;
- realvec r = vec_rsqrte(x.v); // this is only an approximation
- // TODO: use fma
- // two Newton iterations (see vml_rsqrt)
- r += RV(0.5)*r * (RV(1.0) - x * r*r);
- r += RV(0.5)*r * (RV(1.0) - x * r*r);
- return r;
- }
- boolvec_t signbit() const
- {
- return !RV(1.0).copysign(*this).as_int().as_bool();
- }
- realvec sin() const { return sind4(v); }
- realvec sinh() const { return sinhd4(v); }
- realvec sqrt() const
- {
- // return vec_sqrtsw_nochk(v);
- return *this * rsqrt();
- }
- realvec tan() const { return tand4(v); }
- realvec tanh() const { return tanhd4(v); }
- realvec trunc() const { return vec_trunc(v); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,4> boolvec<double,4>::as_int() const
- {
- return v;
- }
-
- inline intvec<double,4> boolvec<double,4>::convert_int() const
- {
- return ifthen(IV(I(1)), IV(I(0)));
- }
-
- inline
- boolvec<double,4>
- boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline
- intvec<double,4>
- boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return ifthen(x.as_float(), y.as_float()).as_int();
- }
-
- inline
- realvec<double,4>
- boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
-
-
- // intvec definitions
-
- inline intvec<double,4> intvec<double,4>::abs() const
- {
- return MF::vml_abs(*this);
- }
-
- inline realvec<double,4> intvec<double,4>::as_float() const
- {
- return v;
- }
-
- inline intvec<double,4> intvec<double,4>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<double,4> intvec<double,4>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<double,4> intvec<double,4>::convert_float() const
- {
- return vec_cfid(v);
- }
-
- inline boolvec<double,4> intvec<double,4>::isignbit() const
- {
- return MF::vml_isignbit(*this);
- }
-
- inline intvec<double,4> intvec<double,4>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
- }
-
- inline intvec<double,4> intvec<double,4>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
- }
-
- inline intvec<double,4> intvec<double,4>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<double,4> intvec<double,4>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+ }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); } // delegated to the MF helper
+ realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); } // wraps the scalar exponent in an intvec_t and reuses the overload below
+ realvec ldexp(intvec_t n) const { // computed one lane at a time with vml_std::ldexp
+ real_t ldexp_[] = {
+ vml_std::ldexp((*this)[0], n[0]), vml_std::ldexp((*this)[1], n[1]),
+ vml_std::ldexp((*this)[2], n[2]), vml_std::ldexp((*this)[3], n[3])}; // lanes 0..3, each paired with the matching lane of n
+ return realvec_t(ldexp_); // rebuild a vector from the four scalar results
+ }
+ realvec log() const { return logd4(v); } // the log*/pow forwarders below call *d4 routines on the raw member v
+ realvec log10() const { return log10d4(v); }
+ realvec log1p() const { return log1pd4(v); }
+ realvec log2() const { return log2d4(v); }
+ realvec_t mad(realvec_t y, realvec_t z) const { // delegates to the MF helper (contrast with fma, which calls vec_madd)
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return powd4(v, y.v); } // this vector is passed first, y second
+ realvec rcp() const { return recip_fastd4(v); } // NOTE(review): the name suggests a reduced-accuracy reciprocal -- confirm the precision contract
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const { // always uses the MF helper; the vec_round shortcut below is kept only as disabled code
+ return MF::vml_rint(*this);
+ // This is tempting, but seems too invasive
+ // #ifdef VML_HAVE_FP_CONTRACT
+ // return MF::vml_rint(*this);
+ // #else
+ // return vec_round(v); // use round instead of rint
+ // #endif
+ }
+ realvec round() const { return vec_round(v); } // direct vec_round call on the raw vector
+ realvec rsqrt() const { // reciprocal square root: hardware estimate refined by two correction steps
+ realvec x = *this;
+ realvec r = vec_rsqrte(x.v); // this is only an approximation
+ // TODO: use fma
+ // two Newton iterations (see vml_rsqrt)
+ r += RV(0.5) * r * (RV(1.0) - x * r * r); // first step: r <- r + r/2 * (1 - x*r^2)
+ r += RV(0.5) * r * (RV(1.0) - x * r * r); // second, identical step applied to the updated r
+ return r;
+ }
+ boolvec_t signbit() const { // copies this vector's sign onto 1.0, reinterprets as bool, then negates
+ return !RV(1.0).copysign(*this).as_int().as_bool(); // presumably relies on the QPX boolean encoding by sign -- TODO confirm
+ }
+ realvec sin() const { return sind4(v); }
+ realvec sinh() const { return sinhd4(v); }
+ realvec sqrt() const { // computed as x * rsqrt(x); the direct intrinsic below is disabled
+ // return vec_sqrtsw_nochk(v);
+ return *this * rsqrt(); // NOTE(review): for a zero lane this multiplies 0 by rsqrt(0); verify the result is 0 and not NaN
+ }
+ realvec tan() const { return tand4(v); }
+ realvec tanh() const { return tanhd4(v); }
+ realvec trunc() const { return vec_trunc(v); }
+};
+
+// boolvec definitions
+
+inline intvec<double, 4> boolvec<double, 4>::as_int() const { return v; } // returns the stored vector as-is; relies on an implicit conversion to intvec
+
+inline intvec<double, 4> boolvec<double, 4>::convert_int() const { // selects integer 1 or 0 per lane through ifthen
+ return ifthen(IV(I(1)), IV(I(0))); // first argument is the value for true lanes, second for false lanes
+}
+
+inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x,
+ boolvec_t y) const { // boolean select, implemented on top of the integer overload
+ return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x,
+ intvec_t y) const { // integer select, implemented on top of the real overload
+ return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x,
+ realvec_t y) const { // real select: the one overload that calls the intrinsic
+ return vec_sel(y.v, x.v, v); // note the order: else-value y first, then-value x second, mask v last
+}
+
+// intvec definitions
+
+inline intvec<double, 4> intvec<double, 4>::abs() const { // delegated to the MF helper
+ return MF::vml_abs(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::as_float() const { return v; } // returns the stored vector as-is; relies on an implicit conversion to realvec
+
+inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x,
+ intvec_t y) const { // *this is passed as the first (mask) argument of the helper
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 4> intvec<double, 4>::clz() const { // delegated to the MF helper
+ return MF::vml_clz(*this);
+}
+
+inline realvec<double, 4> intvec<double, 4>::convert_float() const { // value conversion via the vec_cfid intrinsic (as_float above does no conversion call)
+ return vec_cfid(v);
+}
+
+inline boolvec<double, 4> intvec<double, 4>::isignbit() const { // delegated to the MF helper
+ return MF::vml_isignbit(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const { // delegated to the MF helper
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const { // delegated to the MF helper
+ return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 4> intvec<double, 4>::popcount() const { // delegated to the MF helper
+ return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const { // same scalar count n is handed to the helper
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const { // per-lane counts variant
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_QPX_DOUBLE4_H
+#endif // #ifndef VEC_QPX_DOUBLE4_H
diff --git a/vec_sse_double1.h b/vec_sse_double1.h
index 5558356..d727de8 100644
--- a/vec_sse_double1.h
+++ b/vec_sse_double1.h
@@ -12,589 +12,493 @@
// SSE2 intrinsics
#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-# include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-# include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-# include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
#endif
-#if defined __AVX__ // Intel's AVX
-# include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_1
- template<> struct boolvec<double,1>;
- template<> struct intvec<double,1>;
- template<> struct realvec<double,1>;
-
-
-
- template<>
- struct boolvec<double,1>: floatprops<double>
- {
- static int const size = 1;
- typedef bool scalar_t;
- typedef uint_t bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- // true values are non-zero, false values are zero
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(a) {}
- boolvec(bool const* as): v(as[0]) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const { return v; }
- boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return !v; }
-
- boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
- boolvec_t operator||(boolvec_t x) const { return v || x.v; }
- boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
- boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-
- bool all() const { return *this; }
- bool any() const { return *this; }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,1>: floatprops<double>
- {
- static int const size = 1;
- typedef int_t scalar_t;
- typedef int_t ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(int_t a): v(a) {}
- intvec(int_t const* as): v(as[0]) {}
- static intvec_t iota() { return intvec(I(0)); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const { return v; }
- intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-
-
-
- boolvec_t as_bool() const { return U(v); }
- boolvec_t convert_bool() const { return bool(v); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec_t operator+() const { return +v; }
- intvec_t operator-() const { return -v; }
-
- intvec_t operator+(intvec_t x) const { return v+x.v; }
- intvec_t operator-(intvec_t x) const { return v-x.v; }
- intvec_t operator*(intvec_t x) const { return v*x.v; }
- intvec_t operator/(intvec_t x) const { return v/x.v; }
- intvec_t operator%(intvec_t x) const { return v%x.v; }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
- intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
- intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
- intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-
-
-
- intvec_t operator~() const { return ~v; }
-
- intvec_t operator&(intvec_t x) const { return v&x.v; }
- intvec_t operator|(intvec_t x) const { return v|x.v; }
- intvec_t operator^(intvec_t x) const { return v^x.v; }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const { return U(v) >> U(n); }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const { return v>>n; }
- intvec_t operator<<(int_t n) const { return v<<n; }
-
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const { return v>>n; }
- intvec_t operator<<(intvec_t n) const { return v<<n; }
-
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const { return __builtin_clzll(v); }
- intvec_t popcount() const { return __builtin_popcountll(v); }
-
-
-
- boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
- boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
- boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
- boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
- boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
- boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-
- intvec_t abs() const { return std::abs(v); }
- boolvec_t isignbit() const { return v<0; }
- intvec_t max(intvec_t x) const { return std::max(v, x.v); }
- intvec_t min(intvec_t x) const { return std::min(v, x.v); }
- };
-
-
-
- template<>
- struct realvec<double,1>: floatprops<double>
- {
- static int const size = 1;
- typedef real_t scalar_t;
- typedef double vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<SSE2:1*double>"; }
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- private:
- static __m128d from_double(double a) { return _mm_set_sd(a); }
- static double to_double(__m128d a) { return _mm_cvtsd_f64(a); }
- public:
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(real_t a): v(a) {}
- realvec(real_t const* as): v(as[0]) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const { return v; }
- realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return *p;
- }
- static realvec_t loadu(real_t const* p)
- {
- return *p;
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loada(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return *this;
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return *this;
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loada(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- *p = v;
- }
- void storeu(real_t* p) const
- {
- *p = v;
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storea(p+ioff);
+template <> struct boolvec<double, 1>;
+template <> struct intvec<double, 1>;
+template <> struct realvec<double, 1>;
+
+template <> struct boolvec<double, 1> : floatprops<double> { // one-lane boolean companion of realvec<double, 1>
+ static int const size = 1; // number of lanes
+ typedef bool scalar_t;
+ typedef uint_t bvector_t; // storage type of the single lane
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong"); // storage must be exactly as wide as size real_t values
+
+ // true values are non-zero, false values are zero
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v; // the lane value itself
+
+ boolvec() {} // leaves v uninitialized
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {} // stores x unchanged, so any non-zero value counts as true
+ boolvec(bool a) : v(a) {}
+ boolvec(bool const *as) : v(as[0]) {} // reads only the first array element
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return v; } // n is ignored: there is only one lane
+ boolvec_t &set_elt(int n, bool a) { return v = a, *this; } // n is ignored here too
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec_t operator!() const { return !v; }
+
+ boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+ boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+ boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); } // both sides are reduced to bool first, so distinct non-zero values compare equal
+ boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+ bool all() const { return *this; } // with one lane, all() and any() return the same value
+ bool any() const { return *this; }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 1> : floatprops<double> { // one-lane integer companion of realvec<double, 1>
+ static int const size = 1; // number of lanes
+ typedef int_t scalar_t;
+ typedef int_t ivector_t; // the "vector" is a plain int_t
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong"); // storage must be exactly as wide as size real_t values
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v; // the lane value itself
+
+ intvec() {} // leaves v uninitialized
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(int_t a) : v(a) {}
+ intvec(int_t const *as) : v(as[0]) {} // reads only the first array element
+ static intvec_t iota() { return intvec(I(0)); } // lane indices; with one lane that is just 0
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return v; } // n is ignored: there is only one lane
+ intvec_t &set_elt(int n, int_t a) { return v = a, *this; } // n is ignored here too
+
+ boolvec_t as_bool() const { return U(v); } // passes the U-cast value through the bvector_t constructor
+ boolvec_t convert_bool() const { return bool(v); } // non-zero becomes true
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec_t operator+() const { return +v; }
+ intvec_t operator-() const { return -v; }
+
+ intvec_t operator+(intvec_t x) const { return v + x.v; }
+ intvec_t operator-(intvec_t x) const { return v - x.v; }
+ intvec_t operator*(intvec_t x) const { return v * x.v; }
+ intvec_t operator/(intvec_t x) const { return v / x.v; }
+ intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } // compound forms are built from the binary operators above
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+ intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+ intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+ intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+ intvec_t operator~() const { return ~v; }
+
+ intvec_t operator&(intvec_t x) const { return v & x.v; }
+ intvec_t operator|(intvec_t x) const { return v | x.v; }
+ intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const { return U(v) >> U(n); } // shift is performed on the U (uint_t) casts of both operands
+ intvec_t rotate(int_t n) const;
+ intvec_t operator>>(int_t n) const { return v >> n; } // shifts the int_t value directly, unlike lsr
+ intvec_t operator<<(int_t n) const { return v << n; }
+
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } // n reaches U() through its ivector_t conversion operator
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const { return v >> n; } // NOTE(review): n is an intvec_t here; which overload resolves this shift is not visible -- confirm
+ intvec_t operator<<(intvec_t n) const { return v << n; }
+
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const { return __builtin_clzll(v); } // NOTE(review): GCC documents __builtin_clzll as undefined for an argument of 0 -- confirm callers never pass 0
+ intvec_t popcount() const { return __builtin_popcountll(v); }
+
+ boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+ boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+ boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+ boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+ boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+ boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+ intvec_t abs() const { return std::abs(v); }
+ boolvec_t isignbit() const { return v < 0; } // true exactly when the value is negative
+ intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+ intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<double, 1> : floatprops<double> {
+ static int const size = 1;
+ typedef real_t scalar_t;
+ typedef double vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<SSE2:1*double>"; } // fixed label identifying this specialization
+ void barrier() { __asm__("" : "+x"(v)); } // empty asm listing v as a read-write operand; presumably an optimization barrier on v -- TODO confirm
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+private:
+ static __m128d from_double(double a) { return _mm_set_sd(a); } // wraps a scalar in an __m128d so the *_sd intrinsics can be used
+ static double to_double(__m128d a) { return _mm_cvtsd_f64(a); } // inverse helper: returns the scalar double held in a
+
+public:
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {} // leaves v uninitialized
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(real_t a) : v(a) {}
+ realvec(real_t const *as) : v(as[0]) {} // reads only the first array element
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return v; } // n is ignored: there is only one lane
+ realvec_t &set_elt(int n, real_t a) { return v = a, *this; } // n is ignored here too
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) { // aligned load: asserts alignment, then reads *p
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return *p;
+ }
+ static realvec_t loadu(real_t const *p) { return *p; } // unaligned load: same read, no alignment assertion
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { // load from p + ioff, where the base p itself is asserted aligned
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return loada(p + ioff); // loada asserts again on the offset pointer
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return *this;
}
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- }
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return *this;
}
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- }
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return loada(p + ioff, m);
+ }
+
+ void storea(real_t *p) const { // aligned store: asserts alignment, then writes v to *p
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ *p = v;
+ }
+ void storeu(real_t *p) const { *p = v; } // unaligned store: same write, no alignment assertion
+ void storeu(real_t *p, std::ptrdiff_t ioff) const { // store to p + ioff, where the base p itself is asserted aligned
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storea(p + ioff); // storea asserts again on the offset pointer
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
}
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storea(p+ioff, m);
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
}
-
-
-
- intvec_t as_int() const { return floatprops::as_int(v); }
- intvec_t convert_int() const {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storea(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return floatprops::as_int(v); } // delegates to the floatprops base-class helper
+ intvec_t convert_int() const {
#ifdef __x86_64__
- return _mm_cvttsd_si64(_mm_set_sd(v));
+ return _mm_cvttsd_si64(_mm_set_sd(v));
#else
- return floatprops::convert_int(v);
+ return floatprops::convert_int(v);
#endif
- }
-
-
-
- realvec_t operator+() const { return +v; }
- realvec_t operator-() const { return -v; }
-
- realvec_t operator+(realvec_t x) const { return v+x.v; }
- realvec_t operator-(realvec_t x) const { return v-x.v; }
- realvec_t operator*(realvec_t x) const { return v*x.v; }
- realvec_t operator/(realvec_t x) const { return v/x.v; }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const { return *this; }
- real_t minval() const { return *this; }
- real_t prod() const { return *this; }
- real_t sum() const { return *this; }
-
-
-
- boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
- boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
- boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
- boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
- boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
- boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const
- {
+ }
+
+ realvec_t operator+() const { return +v; } // arithmetic operators act directly on the scalar member v
+ realvec_t operator-() const { return -v; }
+
+ realvec_t operator+(realvec_t x) const { return v + x.v; }
+ realvec_t operator-(realvec_t x) const { return v - x.v; }
+ realvec_t operator*(realvec_t x) const { return v * x.v; }
+ realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } // compound forms are built from the binary operators above
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+ real_t maxval() const { return *this; } // with a single lane every reduction is just the lane value
+ real_t minval() const { return *this; }
+ real_t prod() const { return *this; }
+ real_t sum() const { return *this; }
+
+ boolvec_t operator==(realvec_t const &x) const { return v == x.v; } // comparisons use the built-in double operators
+ boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+ boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+ boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+ boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+ boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+ realvec_t acos() const { return MF::vml_acos(*this); } // the functions in this run all delegate to MF::vml_* helpers
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // this vector is passed first, y second
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const {
#ifdef __SSE4_1__
- return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
+ return to_double(_mm_ceil_sd(from_double(v), from_double(v)));
#else
- return vml_std::ceil(v);
+ return vml_std::ceil(v);
#endif
- }
- realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return vml_std::fabs(v); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const
- {
+ }
+ realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); } // scalar routine from vml_std; v first, y.v second
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return vml_std::fabs(v); } // scalar routine from vml_std rather than an MF helper
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const {
#ifdef __SSE4_1__
- return to_double(_mm_floor_sd(from_double(v), from_double(v)));
+ return to_double(_mm_floor_sd(from_double(v), from_double(v)));
#else
- return vml_std::floor(v);
+ return vml_std::floor(v);
#endif
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
- }
- realvec_t fmax(realvec_t y) const
- {
- return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
- }
- realvec_t fmin(realvec_t y) const
- {
- return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
- }
- realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
- realvec_t frexp(intvec_t* irp) const
- {
- int iri;
- realvec_t r = vml_std::frexp(v, &iri);
- int_t ir = iri;
- if (isinf()) ir = std::numeric_limits<int_t>::max();
- if (isnan()) ir = std::numeric_limits<int_t>::min();
- irp->v = ir;
- return r;
- }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const
- {
- int_t r = vml_std::ilogb(v);
- typedef std::numeric_limits<int_t> NL;
- if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
- r = NL::min();
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const {
+ return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+ }
+ realvec_t fmin(realvec_t y) const {
+ return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+ }
+ realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
+ realvec_t frexp(intvec_t *irp) const {
+ int iri;
+ realvec_t r = vml_std::frexp(v, &iri);
+ int_t ir = iri;
+ if (isinf())
+ ir = std::numeric_limits<int_t>::max();
+ if (isnan())
+ ir = std::numeric_limits<int_t>::min();
+ irp->v = ir;
+ return r;
+ }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const {
+ int_t r = vml_std::ilogb(v);
+ typedef std::numeric_limits<int_t> NL;
+ if (FP_ILOGB0 != NL::min() and v == R(0.0)) {
+ r = NL::min();
#if defined VML_HAVE_INF
- } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
- r = NL::max();
+ } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+ r = NL::max();
#endif
#if defined VML_HAVE_NAN
- } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
- r = NL::min();
+ } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) {
+ r = NL::min();
#endif
- }
- return r;
- }
- boolvec_t isfinite() const { return vml_std::isfinite(v); }
- boolvec_t isinf() const { return vml_std::isinf(v); }
- boolvec_t isnan() const
- {
- // This is wrong:
- // return _mm_ucomineq_sd(from_double(v), from_double(v));
- // This works:
- // char r;
- // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
- // return boolvec_t::scalar_t(r);
- // This works as well:
- return vml_std::isnan(v);
- }
- boolvec_t isnormal() const { return vml_std::isnormal(v); }
- realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
- realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const { return R(1.0)/v; }
- realvec_t remainder(realvec_t y) const
- {
- return vml_std::remainder(v, y.v);
}
- realvec_t rint() const
- {
+ return r;
+ }
+ boolvec_t isfinite() const { return vml_std::isfinite(v); }
+ boolvec_t isinf() const { return vml_std::isinf(v); }
+ boolvec_t isnan() const {
+ // This is wrong:
+ // return _mm_ucomineq_sd(from_double(v), from_double(v));
+ // This works:
+ // char r;
+ // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+ // return boolvec_t::scalar_t(r);
+ // This works as well:
+ return vml_std::isnan(v);
+ }
+ boolvec_t isnormal() const { return vml_std::isnormal(v); }
+ realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
+ realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ realvec_t rcp() const { return R(1.0) / v; }
+ realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); }
+ realvec_t rint() const {
#ifdef __SSE4_1__
- return to_double(_mm_round_sd(from_double(v), from_double(v),
- _MM_FROUND_TO_NEAREST_INT));
+ return to_double(_mm_round_sd(from_double(v), from_double(v),
+ _MM_FROUND_TO_NEAREST_INT));
#else
- return MF::vml_rint(*this);
+ return MF::vml_rint(*this);
#endif
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return vml_std::signbit(v); }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const
- {
- return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
- }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const
- {
+ }
+ realvec_t round() const { return MF::vml_round(*this); }
+ realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+ boolvec_t signbit() const { return vml_std::signbit(v); }
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const {
+ return to_double(_mm_sqrt_sd(from_double(v), from_double(v)));
+ }
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const {
#ifdef __SSE4_1__
- return to_double(_mm_round_sd(from_double(v), from_double(v),
- _MM_FROUND_TO_ZERO));
+ return to_double(
+ _mm_round_sd(from_double(v), from_double(v), _MM_FROUND_TO_ZERO));
#else
- return MF::vml_trunc(*this);
+ return MF::vml_trunc(*this);
#endif
- }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,1> boolvec<double,1>::as_int() const
- {
- return I(v);
}
-
- inline intvec<double,1> boolvec<double,1>::convert_int() const
- {
- return v;
- }
-
- inline
- boolvec<double,1> boolvec<double,1>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return v ? x : y;
- }
-
- inline
- intvec<double,1> boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const
- {
- return v ? x : y;
- }
-
- inline
- realvec<double,1> boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const
- {
- return v ? x : y;
- }
-
-
-
- // intvec definitions
-
- inline realvec<double,1> intvec<double,1>::as_float() const
- {
- return FP::as_float(v);
- }
-
- inline realvec<double,1> intvec<double,1>::convert_float() const
- {
+};
+
+// boolvec definitions
+
+inline intvec<double, 1> boolvec<double, 1>::as_int() const { return I(v); }
+
+inline intvec<double, 1> boolvec<double, 1>::convert_int() const { return v; }
+
+inline boolvec<double, 1> boolvec<double, 1>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return v ? x : y;
+}
+
+inline intvec<double, 1> boolvec<double, 1>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return v ? x : y;
+}
+
+inline realvec<double, 1> boolvec<double, 1>::ifthen(realvec_t x,
+ realvec_t y) const {
+ return v ? x : y;
+}
+
+// intvec definitions
+
+inline realvec<double, 1> intvec<double, 1>::as_float() const {
+ return FP::as_float(v);
+}
+
+inline realvec<double, 1> intvec<double, 1>::convert_float() const {
#ifdef __x86_64__
- return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
+ return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v));
#else
- return FP::convert_float(v);
+ return FP::convert_float(v);
#endif
- }
-
- inline intvec<double,1> intvec<double,1>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<double,1> intvec<double,1>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<double,1> intvec<double,1>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+}
+
+inline intvec<double, 1> intvec<double, 1>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 1> intvec<double, 1>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_SSE_DOUBLE1_H
+#endif // #ifndef VEC_SSE_DOUBLE1_H
diff --git a/vec_sse_double2.h b/vec_sse_double2.h
index 5d64688..095f458 100644
--- a/vec_sse_double2.h
+++ b/vec_sse_double2.h
@@ -11,737 +11,600 @@
// SSE2 intrinsics
#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-# include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-# include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-# include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
#endif
-#if defined __AVX__ // Intel's AVX
-# include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_2
- template<> struct boolvec<double,2>;
- template<> struct intvec<double,2>;
- template<> struct realvec<double,2>;
-
-
-
- template<>
- struct boolvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef bool scalar_t;
- typedef __m128d bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - uint_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a):
- v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
- boolvec(bool const* as):
- v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec_t& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
-
- boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
- boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
- boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
- boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
-
- bool all() const
- {
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef bool scalar_t;
+ typedef __m128d bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return -uint_t(a); }
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {}
+ boolvec(bool const *as)
+ : v(_mm_castsi128_pd(
+ _mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec_t &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); }
+
+ boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); }
+ boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); }
+ boolvec_t operator==(boolvec_t x) const { return !(*this != x); }
+ boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); }
+
+ bool all() const {
#if defined __AVX__
- return ! (! *this).any();
+ return !(!*this).any();
#else
- return (*this)[0] && (*this)[1];
+ return (*this)[0] && (*this)[1];
#endif
- }
- bool any() const
- {
+ }
+ bool any() const {
#if defined __AVX__
- return ! bool(_mm_testz_pd(v, v));
+ return !bool(_mm_testz_pd(v, v));
#else
- return (*this)[0] || (*this)[1];
+ return (*this)[0] || (*this)[1];
#endif
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef int_t scalar_t;
+ typedef __m128i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(_mm_set1_epi64x(a)) {}
+ intvec(int_t const *as) : v(_mm_set_epi64x(as[1], as[0])) {}
+ static intvec_t iota() { return _mm_set_epi64x(1, 0); }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
+ boolvec_t convert_bool() const {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare to zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec_t x = *this;
+ // We know that boolvec_t values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec_t operator+() const { return *this; }
+ intvec_t operator-() const { return IV(I(0)) - *this; }
+
+ intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
+ intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+ intvec_t operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec_t operator&(intvec_t x) const {
+ return _mm_castpd_si128(
+ _mm_and_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+ }
+ intvec_t operator|(intvec_t x) const {
+ return _mm_castpd_si128(
+ _mm_or_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+ }
+ intvec_t operator^(intvec_t x) const {
+ return _mm_castpd_si128(
+ _mm_xor_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v)));
+ }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
+ intvec_t rotate(int_t n) const;
+ intvec_t operator>>(int_t n) const {
+ // There is no _mm_srai_epi64. To emulate it, add the sign-bit bias
+ // U(1) << (bits - 1) before shifting, and subtract the shifted
+ // bias U(1) << (bits - 1 - n) after shifting
+ intvec_t x = *this;
+ // Convert signed to unsigned
+ x += U(1) << (bits - 1);
+ // Shift
+ x = x.lsr(n);
+ // Undo conversion
+ x -= U(1) << (bits - 1 - n);
+ return x;
+ }
+ intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef int_t scalar_t;
- typedef __m128i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm_set1_epi64x(a)) {}
- intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {}
- static intvec_t iota() { return _mm_set_epi64x(1, 0); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return _mm_castsi128_pd(v); }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- // There is no intrinsic to compare to zero. Instead, we check
- // whether x is positive and x-1 is negative.
- intvec_t x = *this;
- // We know that boolvec_t values depend only on the sign bit
- // return (~(x-1) | x).as_bool();
- // return x.as_bool() || !(x-1).as_bool();
- return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec_t operator+() const { return *this; }
- intvec_t operator-() const { return IV(I(0)) - *this; }
-
- intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); }
- intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-
-
-
- intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec_t operator&(intvec_t x) const
- {
- return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v),
- _mm_castsi128_pd(x.v)));
- }
- intvec_t operator|(intvec_t x) const
- {
- return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v),
- _mm_castsi128_pd(x.v)));
- }
- intvec_t operator^(intvec_t x) const
- {
- return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v),
- _mm_castsi128_pd(x.v)));
- }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const
- {
- // There is no _mm_srai_epi64. To emulate it, add 0x80000000
- // before shifting, and subtract the shifted 0x80000000 after
- // shifting
- intvec_t x = *this;
- // Convert signed to unsiged
- x += U(1) << (bits-1);
- // Shift
- x = x.lsr(n);
- // Undo conversion
- x -= U(1) << (bits-1-n);
- return x;
- }
- intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); }
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
- }
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
- }
- intvec_t operator<<(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
- }
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
- return ! (*this != x);
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
- return (*this ^ x).convert_bool();
- }
- boolvec_t operator<(intvec_t const& x) const
- {
- // return (*this - x).as_bool();
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef real_t scalar_t;
- typedef __m128d vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<SSE2:2*double>"; }
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm_set1_pd(a)) {}
- realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm_load_pd(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm_loadu_pd(p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
+ return r;
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
+ return r;
+ }
+ intvec_t operator<<(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm_store_pd(p, v);
+ return r;
+ }
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ boolvec_t operator==(intvec_t const &x) const { return !(*this != x); }
+ boolvec_t operator!=(intvec_t const &x) const {
+ return (*this ^ x).convert_bool();
+ }
+ boolvec_t operator<(intvec_t const &x) const {
+ // return (*this - x).as_bool();
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- void storeu(real_t* p) const
- {
- return _mm_storeu_pd(p, v);
+ return r;
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef real_t scalar_t;
+ typedef __m128d vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<SSE2:2*double>"; }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm_set1_pd(a)) {}
+ realvec(real_t const *as) : v(_mm_set_pd(as[1], as[0])) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm_load_pd(p);
+ }
+ static realvec_t loadu(real_t const *p) { return _mm_loadu_pd(p); }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm_store_pd(p, v);
+ }
+ void storeu(real_t *p) const { return _mm_storeu_pd(p, v); }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
#if defined __AVX__
- _mm_maskstore_pd(p, m.m.as_int(), v);
+ _mm_maskstore_pd(p, m.m.as_int(), v);
#else
- if (m.m[0]) _mm_storel_pd(p , v);
- else if (m.m[1]) _mm_storeh_pd(p+1, v);
+ if (m.m[0])
+ _mm_storel_pd(p, v);
+ else if (m.m[1])
+ _mm_storeh_pd(p + 1, v);
#endif
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- if (m.m[0]) _mm_storel_pd(p , v);
- else if (m.m[1]) _mm_storeh_pd(p+1, v);
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
}
-
-
-
- intvec_t as_int() const { return _mm_castpd_si128(v); }
- intvec_t convert_int() const
- {
- intvec_t r;
- r.set_elt(0, floatprops::convert_int((*this)[0]));
- r.set_elt(1, floatprops::convert_int((*this)[1]));
- return r;
- }
-
-
-
- realvec_t operator+() const { return *this; }
- realvec_t operator-() const { return RV(0.0) - *this; }
-
- realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
- realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
- realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
- realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- return vml_std::fmax((*this)[0], (*this)[1]);
- }
- real_t minval() const
- {
- return vml_std::fmin((*this)[0], (*this)[1]);
- }
- real_t prod() const
- {
- return (*this)[0] * (*this)[1];
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0])
+ _mm_storel_pd(p, v);
+ else if (m.m[1])
+ _mm_storeh_pd(p + 1, v);
}
- real_t sum() const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return _mm_castpd_si128(v); }
+ intvec_t convert_int() const {
+ intvec_t r;
+ r.set_elt(0, floatprops::convert_int((*this)[0]));
+ r.set_elt(1, floatprops::convert_int((*this)[1]));
+ return r;
+ }
+
+ realvec_t operator+() const { return *this; }
+ realvec_t operator-() const { return RV(0.0) - *this; }
+
+ realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); }
+ realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); }
+ realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); }
+ realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); }
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+ real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); }
+ real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+ real_t prod() const { return (*this)[0] * (*this)[1]; }
+ real_t sum() const {
#ifdef __SSE3__
- return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
+ return _mm_cvtsd_f64(_mm_hadd_pd(v, v));
#else
- return (*this)[0] + (*this)[1];
+ return (*this)[0] + (*this)[1];
#endif
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- return _mm_cmpeq_pd(v, x.v);
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- return _mm_cmpneq_pd(v, x.v);
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- return _mm_cmplt_pd(v, x.v);
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- return _mm_cmple_pd(v, x.v);
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- return _mm_cmpgt_pd(v, x.v);
- }
- boolvec_t operator>=(realvec_t const& x) const
- {
- return _mm_cmpge_pd(v, x.v);
- }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const
- {
+ }
+
+ boolvec_t operator==(realvec_t const &x) const {
+ return _mm_cmpeq_pd(v, x.v);
+ }
+ boolvec_t operator!=(realvec_t const &x) const {
+ return _mm_cmpneq_pd(v, x.v);
+ }
+ boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_pd(v, x.v); }
+ boolvec_t operator<=(realvec_t const &x) const {
+ return _mm_cmple_pd(v, x.v);
+ }
+ boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_pd(v, x.v); }
+ boolvec_t operator>=(realvec_t const &x) const {
+ return _mm_cmpge_pd(v, x.v);
+ }
+
+ realvec_t acos() const { return MF::vml_acos(*this); }
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const {
#ifdef __SSE4_1__
- return _mm_ceil_pd(v);
+ return _mm_ceil_pd(v);
#else
- return MF::vml_ceil(*this);
+ return MF::vml_ceil(*this);
#endif
- }
- realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return MF::vml_fabs(*this); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const
- {
+ }
+ realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return MF::vml_fabs(*this); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const {
#ifdef __SSE4_1__
- return _mm_floor_pd(v);
+ return _mm_floor_pd(v);
#else
- return MF::vml_floor(*this);
+ return MF::vml_floor(*this);
#endif
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
- }
- realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
- realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
- realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
- realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
+ realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
+ realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+ realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
#ifdef VML_HAVE_NAN
- return _mm_cmpunord_pd(v, v);
+ return _mm_cmpunord_pd(v, v);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
- realvec_t remainder(realvec_t y) const
- {
- return MF::vml_remainder(*this, y);
- }
- realvec_t rint() const
- {
+ }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); }
+ realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+ realvec_t rint() const {
#ifdef __SSE4_1__
- return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
+ return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT);
#else
- return MF::vml_rint(*this);
+ return MF::vml_rint(*this);
#endif
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return v; }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return _mm_sqrt_pd(v); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const
- {
+ }
+ realvec_t round() const { return MF::vml_round(*this); }
+ realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+ boolvec_t signbit() const { return v; }
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return _mm_sqrt_pd(v); }
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const {
#ifdef __SSE4_1__
- return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
+ return _mm_round_pd(v, _MM_FROUND_TO_ZERO);
#else
- return MF::vml_trunc(*this);
+ return MF::vml_trunc(*this);
#endif
- }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,2> boolvec<double,2>::as_int() const
- {
- return _mm_castpd_si128(v);
- }
-
- inline intvec<double,2> boolvec<double,2>::convert_int() const
- {
- //return ifthen(v, U(1), U(0));
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline
- intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
- {
- return ifthen(x.as_float(), y.as_float()).as_int();
- }
-
- inline
- realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
- {
+ }
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const {
+ return _mm_castpd_si128(v);
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const {
+ // return ifthen(v, U(1), U(0));
+ return lsr(as_int(), bits - 1);
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x,
+ realvec_t y) const {
#ifdef __SSE4_1__
- return _mm_blendv_pd(y.v, x.v, v);
+ return _mm_blendv_pd(y.v, x.v, v);
#else
- return (( -convert_int() & x.as_int()) |
- (~-convert_int() & y.as_int())).as_float();
+ return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+ .as_float();
#endif
- }
-
-
-
- // intvec definitions
-
- inline realvec<double,2> intvec<double,2>::as_float() const
- {
- return _mm_castsi128_pd(v);
- }
-
- inline realvec<double,2> intvec<double,2>::convert_float() const
- {
- realvec_t r;
- r.set_elt(0, floatprops::convert_float((*this)[0]));
- r.set_elt(1, floatprops::convert_float((*this)[1]));
- return r;
- }
-
- inline intvec<double,2> intvec<double,2>::abs() const
- {
- return MF::vml_abs(*this);
- }
-
- inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<double,2> intvec<double,2>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
- }
-
- inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
- }
-
- inline intvec<double,2> intvec<double,2>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+}
+
+// intvec definitions
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const {
+ return _mm_castsi128_pd(v);
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const {
+ realvec_t r;
+ r.set_elt(0, floatprops::convert_float((*this)[0]));
+ r.set_elt(1, floatprops::convert_float((*this)[1]));
+ return r;
+}
+
+inline intvec<double, 2> intvec<double, 2>::abs() const {
+ return MF::vml_abs(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const {
+ return MF::vml_clz(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const {
+ return MF::vml_max(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const {
+ return MF::vml_min(*this, x);
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const {
+ return MF::vml_popcount(*this);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_SSE_DOUBLE2_H
+#endif // #ifndef VEC_SSE_DOUBLE2_H
diff --git a/vec_sse_float1.h b/vec_sse_float1.h
index 9cee891..a84a046 100644
--- a/vec_sse_float1.h
+++ b/vec_sse_float1.h
@@ -12,583 +12,489 @@
// SSE2 intrinsics
#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-# include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-# include <smmintrin.h>
+#ifdef __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-# include <ammintrin.h>
+#ifdef __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
#endif
-#if defined __AVX__ // Intel's AVX
-# include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_1
- template<> struct boolvec<float,1>;
- template<> struct intvec<float,1>;
- template<> struct realvec<float,1>;
-
-
-
- template<>
- struct boolvec<float,1>: floatprops<float>
- {
- static int const size = 1;
- typedef bool scalar_t;
- typedef uint_t bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- // true values are non-zero, false values are zero
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v(a) {}
- boolvec(bool const* as): v(as[0]) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const { return v; }
- boolvec_t& set_elt(int n, bool a) { return v=a, *this; }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return !v; }
-
- boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
- boolvec_t operator||(boolvec_t x) const { return v || x.v; }
- boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
- boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
-
- bool all() const { return *this; }
- bool any() const { return *this; }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,1>: floatprops<float>
- {
- static int const size = 1;
- typedef int_t scalar_t;
- typedef int_t ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(int_t a): v(a) {}
- intvec(int_t const* as): v(as[0]) {}
- static intvec_t iota() { return intvec(I(0)); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const { return v; }
- intvec_t& set_elt(int n, int_t a) { return v=a, *this; }
-
-
-
- boolvec_t as_bool() const { return U(v); }
- boolvec_t convert_bool() const { return bool(v); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- intvec_t operator+() const { return +v; }
- intvec_t operator-() const { return -v; }
-
- intvec_t operator+(intvec_t x) const { return v+x.v; }
- intvec_t operator-(intvec_t x) const { return v-x.v; }
- intvec_t operator*(intvec_t x) const { return v*x.v; }
- intvec_t operator/(intvec_t x) const { return v/x.v; }
- intvec_t operator%(intvec_t x) const { return v%x.v; }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
- intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; }
- intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; }
- intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; }
-
-
-
- intvec_t operator~() const { return ~v; }
-
- intvec_t operator&(intvec_t x) const { return v&x.v; }
- intvec_t operator|(intvec_t x) const { return v|x.v; }
- intvec_t operator^(intvec_t x) const { return v^x.v; }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const { return U(v) >> U(n); }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const { return v>>n; }
- intvec_t operator<<(int_t n) const { return v<<n; }
-
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const { return v>>n; }
- intvec_t operator<<(intvec_t n) const { return v<<n; }
-
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const { return __builtin_clz(v); }
- intvec_t popcount() const { return __builtin_popcount(v); }
-
-
-
- boolvec_t operator==(intvec_t const& x) const { return v==x.v; }
- boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; }
- boolvec_t operator<(intvec_t const& x) const { return v<x.v; }
- boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; }
- boolvec_t operator>(intvec_t const& x) const { return v>x.v; }
- boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; }
-
- intvec_t abs() const { return std::abs(v); }
- boolvec_t isignbit() const { return v<0; }
- intvec_t max(intvec_t x) const { return std::max(v, x.v); }
- intvec_t min(intvec_t x) const { return std::min(v, x.v); }
- };
-
-
-
- template<>
- struct realvec<float,1>: floatprops<float>
- {
- static int const size = 1;
- typedef real_t scalar_t;
- typedef float vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<SSE2:1*float>"; }
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- private:
- static __m128 from_float(float a) { return _mm_set_ss(a); }
- static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
- public:
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(real_t a): v(a) {}
- realvec(real_t const* as): v(as[0]) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const { return v; }
- realvec_t& set_elt(int n, real_t a) { return v=a, *this; }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return *p;
- }
- static realvec_t loadu(real_t const* p)
- {
- return *p;
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loada(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return *this;
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return *this;
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loada(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- *p = v;
- }
- void storeu(real_t* p) const
- {
- *p = v;
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storea(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- }
+template <> struct boolvec<float, 1>;
+template <> struct intvec<float, 1>;
+template <> struct realvec<float, 1>;
+
+template <> struct boolvec<float, 1> : floatprops<float> {
+ static int const size = 1;
+ typedef bool scalar_t;
+ typedef uint_t bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+ // true values are non-zero, false values are zero
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a) : v(a) {}
+ boolvec(bool const *as) : v(as[0]) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return v; }
+ boolvec_t &set_elt(int n, bool a) { return v = a, *this; }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec_t operator!() const { return !v; }
+
+ boolvec_t operator&&(boolvec_t x) const { return v && x.v; }
+ boolvec_t operator||(boolvec_t x) const { return v || x.v; }
+ boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); }
+ boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); }
+
+ bool all() const { return *this; }
+ bool any() const { return *this; }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 1> : floatprops<float> {
+ static int const size = 1;
+ typedef int_t scalar_t;
+ typedef int_t ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(int_t a) : v(a) {}
+ intvec(int_t const *as) : v(as[0]) {}
+ static intvec_t iota() { return intvec(I(0)); }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return v; }
+ intvec_t &set_elt(int n, int_t a) { return v = a, *this; }
+
+ boolvec_t as_bool() const { return U(v); }
+ boolvec_t convert_bool() const { return bool(v); }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ intvec_t operator+() const { return +v; }
+ intvec_t operator-() const { return -v; }
+
+ intvec_t operator+(intvec_t x) const { return v + x.v; }
+ intvec_t operator-(intvec_t x) const { return v - x.v; }
+ intvec_t operator*(intvec_t x) const { return v * x.v; }
+ intvec_t operator/(intvec_t x) const { return v / x.v; }
+ intvec_t operator%(intvec_t x) const { return v % x.v; }
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; }
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+ intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; }
+ intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; }
+ intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; }
+
+ intvec_t operator~() const { return ~v; }
+
+ intvec_t operator&(intvec_t x) const { return v & x.v; }
+ intvec_t operator|(intvec_t x) const { return v | x.v; }
+ intvec_t operator^(intvec_t x) const { return v ^ x.v; }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; }
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const { return U(v) >> U(n); }
+ intvec_t rotate(int_t n) const;
+ intvec_t operator>>(int_t n) const { return v >> n; }
+ intvec_t operator<<(int_t n) const { return v << n; }
+
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const { return U(v) >> U(n); }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const { return v >> n; }
+ intvec_t operator<<(intvec_t n) const { return v << n; }
+
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const { return __builtin_clz(v); }
+ intvec_t popcount() const { return __builtin_popcount(v); }
+
+ boolvec_t operator==(intvec_t const &x) const { return v == x.v; }
+ boolvec_t operator!=(intvec_t const &x) const { return v != x.v; }
+ boolvec_t operator<(intvec_t const &x) const { return v < x.v; }
+ boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; }
+ boolvec_t operator>(intvec_t const &x) const { return v > x.v; }
+ boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; }
+
+ intvec_t abs() const { return std::abs(v); }
+ boolvec_t isignbit() const { return v < 0; }
+ intvec_t max(intvec_t x) const { return std::max(v, x.v); }
+ intvec_t min(intvec_t x) const { return std::min(v, x.v); }
+};
+
+template <> struct realvec<float, 1> : floatprops<float> {
+ static int const size = 1;
+ typedef real_t scalar_t;
+ typedef float vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<SSE2:1*float>"; }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+private:
+ static __m128 from_float(float a) { return _mm_set_ss(a); }
+ static float to_float(__m128 a) { return _mm_cvtss_f32(a); }
+
+public:
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(real_t a) : v(a) {} // construct from a scalar
+ realvec(real_t const *as) : v(as[0]) {} // construct from an array; only as[0] is read because size == 1
+
+ operator vector_t() const { return v; } // implicit access to the underlying float
+ real_t operator[](int n) const { return v; } // element read; n is ignored since there is exactly one element
+ realvec_t &set_elt(int n, real_t a) { return v = a, *this; } // element write; n is ignored, returns *this for chaining
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) { // aligned load of the single element at p
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return *p;
+ }
+ static realvec_t loadu(real_t const *p) { return *p; } // unaligned load: same read, without the alignment assertion
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { // load the element ioff elements past p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer is checked; alignment == sizeof(float) here, so p + ioff is aligned whenever p is
+ return loada(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return *this;
}
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- }
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return *this;
}
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storea(p+ioff, m);
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked load at offset ioff from an aligned base pointer
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return loada(p + ioff, m); // forwards to the masked aligned load
+ }
+
+ void storea(real_t *p) const { // aligned store of the single element to p
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ *p = v;
+ }
+ void storeu(real_t *p) const { *p = v; } // unaligned store: same write, without the alignment assertion
+ void storeu(real_t *p, std::ptrdiff_t ioff) const { // store to the element ioff elements past p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // only the base pointer p is checked
+ storea(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
}
-
-
-
- intvec_t as_int() const { return floatprops::as_int(v); }
- intvec_t convert_int() const {
- // return floatprops::convert_int(v);
- return _mm_cvttss_si32(_mm_set_ss(v));
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
}
-
-
-
- realvec_t operator+() const { return +v; }
- realvec_t operator-() const { return -v; }
-
- realvec_t operator+(realvec_t x) const { return v+x.v; }
- realvec_t operator-(realvec_t x) const { return v-x.v; }
- realvec_t operator*(realvec_t x) const { return v*x.v; }
- realvec_t operator/(realvec_t x) const { return v/x.v; }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const { return *this; }
- real_t minval() const { return *this; }
- real_t prod() const { return *this; }
- real_t sum() const { return *this; }
-
-
-
- boolvec_t operator==(realvec_t const& x) const { return v==x.v; }
- boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; }
- boolvec_t operator<(realvec_t const& x) const { return v<x.v; }
- boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; }
- boolvec_t operator>(realvec_t const& x) const { return v>x.v; }
- boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { // masked store at offset ioff from an aligned base pointer
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ storea(p + ioff, m); // forwards to the masked aligned store
+ }
+
+ intvec_t as_int() const { return floatprops::as_int(v); } // delegates to floatprops::as_int (defined elsewhere; presumably a bit-level reinterpretation - confirm)
+ intvec_t convert_int() const { // value conversion float -> int
+ // return floatprops::convert_int(v);
+ return _mm_cvttss_si32(_mm_set_ss(v)); // cvtt* converts with truncation (rounds toward zero)
+ }
+
+ realvec_t operator+() const { return +v; } // unary plus/minus act directly on the stored float
+ realvec_t operator-() const { return -v; }
+
+ realvec_t operator+(realvec_t x) const { return v + x.v; } // binary arithmetic is ordinary scalar float arithmetic
+ realvec_t operator-(realvec_t x) const { return v - x.v; }
+ realvec_t operator*(realvec_t x) const { return v * x.v; }
+ realvec_t operator/(realvec_t x) const { return v / x.v; }
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } // compound assignments are expressed through the binary operators above
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+ real_t maxval() const { return *this; } // horizontal reductions: with a single element each one is just that element
+ real_t minval() const { return *this; }
+ real_t prod() const { return *this; }
+ real_t sum() const { return *this; }
+
+ boolvec_t operator==(realvec_t const &x) const { return v == x.v; } // comparisons use the built-in float comparison operators
+ boolvec_t operator!=(realvec_t const &x) const { return v != x.v; }
+ boolvec_t operator<(realvec_t const &x) const { return v < x.v; }
+ boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; }
+ boolvec_t operator>(realvec_t const &x) const { return v > x.v; }
+ boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; }
+
+ realvec_t acos() const { return MF::vml_acos(*this); } // the functions in this group forward to the generic mathfuncs<realvec_t> implementations (MF)
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } // *this is passed as the first argument, y as the second
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const {
#ifdef __SSE4_1__
- return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
+ return to_float(_mm_ceil_ss(from_float(v), from_float(v)));
#else
- return vml_std::ceil(v);
+ return vml_std::ceil(v);
#endif
- }
- realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return vml_std::fabs(v); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const
- {
+ }
+ realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return vml_std::fabs(v); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const {
#ifdef __SSE4_1__
- return to_float(_mm_floor_ss(from_float(v), from_float(v)));
+ return to_float(_mm_floor_ss(from_float(v), from_float(v)));
#else
- return vml_std::floor(v);
+ return vml_std::floor(v);
#endif
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
- }
- realvec_t fmax(realvec_t y) const
- {
- return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
- }
- realvec_t fmin(realvec_t y) const
- {
- return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
- }
- realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
- realvec_t frexp(intvec_t* irp) const
- {
- int iri;
- realvec_t r = vml_std::frexp(v, &iri);
- int_t ir = iri;
- if (isinf()) ir = std::numeric_limits<int_t>::max();
- if (isnan()) ir = std::numeric_limits<int_t>::min();
- irp->v = ir;
- return r;
- }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const
- {
- int_t r = vml_std::ilogb(v);
- typedef std::numeric_limits<int_t> NL;
- if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
- r = NL::min();
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const { // forwards to the generic MF::vml_fma with (*this, y, z)
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { // maximum via the scalar SSE instruction; NOTE(review): MAXSS returns its second operand when either input is NaN, so a NaN in y propagates, unlike C fmax - confirm this is intended
+ return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+ }
+ realvec_t fmin(realvec_t y) const { // minimum via the scalar SSE instruction; NOTE(review): same NaN caveat as fmax (MINSS returns its second operand)
+ return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+ }
+ realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); } // scalar library fmod
+ realvec_t frexp(intvec_t *irp) const { // split into mantissa (returned) and exponent (written to *irp)
+ int iri;
+ realvec_t r = vml_std::frexp(v, &iri);
+ int_t ir = iri;
+ if (isinf()) // infinities report the largest int_t as exponent
+ ir = std::numeric_limits<int_t>::max();
+ if (isnan()) // NaNs report the smallest int_t as exponent
+ ir = std::numeric_limits<int_t>::min();
+ irp->v = ir;
+ return r;
+ }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } // forwards to the generic implementation
+ intvec_t ilogb() const {
+ int_t r = vml_std::ilogb(v);
+ typedef std::numeric_limits<int_t> NL;
+ if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) {
+ r = NL::min();
#if defined VML_HAVE_INF
- } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
- r = NL::max();
+ } else if (INT_MAX != NL::max() and vml_std::isinf(v)) {
+ r = NL::max();
#endif
#if defined VML_HAVE_NAN
- } else if (FP_ILOGBNAN != NL::min() and isnan()) {
- r = NL::min();
+ } else if (FP_ILOGBNAN != NL::min() and isnan()) {
+ r = NL::min();
#endif
- }
- return r;
}
- boolvec_t isfinite() const { return vml_std::isfinite(v); }
- boolvec_t isinf() const { return vml_std::isinf(v); }
- boolvec_t isnan() const
- {
+ return r;
+ }
+ boolvec_t isfinite() const { return vml_std::isfinite(v); }
+ boolvec_t isinf() const { return vml_std::isinf(v); }
+ boolvec_t isnan() const {
#if defined VML_HAVE_NAN
- // This is wrong:
- // return _mm_ucomineq_ss(from_float(v), from_float(v));
- // This works:
- // char r;
- // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
- // return boolvec_t::scalar_t(r);
- // This works as well:
- return vml_std::isnan(v);
+ // This is wrong:
+ // return _mm_ucomineq_ss(from_float(v), from_float(v));
+ // This works:
+ // char r;
+ // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v));
+ // return boolvec_t::scalar_t(r);
+ // This works as well:
+ return vml_std::isnan(v);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return vml_std::isnormal(v); }
- realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); }
- realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const { return R(1.0)/v; }
- realvec_t remainder(realvec_t y) const
- {
- return vml_std::remainder(v, y.v);
- }
- realvec_t rint() const
- {
+ }
+ boolvec_t isnormal() const { return vml_std::isnormal(v); } // scalar library classification
+ realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); } // scale by 2^n, scalar exponent
+ realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); } // same with a vector exponent; n is passed on through its implicit conversion
+ realvec_t log() const { return MF::vml_log(*this); } // the logarithms forward to the generic mathfuncs implementations
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const { // forwards to MF::vml_mad with (*this, y, z)
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } // forwards to the generic implementation
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } // *this is the base, y the exponent argument
+ realvec_t rcp() const { return R(1.0) / v; } // reciprocal computed by a plain division
+ realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); } // scalar library remainder
+ realvec_t rint() const {
#ifdef __SSE4_1__
- return to_float(_mm_round_ss(from_float(v), from_float(v),
- _MM_FROUND_TO_NEAREST_INT));
+ return to_float(
+ _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_NEAREST_INT));
#else
- return MF::vml_rint(*this);
+ return MF::vml_rint(*this);
#endif
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return vml_std::signbit(v); }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const
- {
+ }
+ realvec_t round() const { return MF::vml_round(*this); } // forwards to the generic implementation
+ realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } // forwards to the generic implementation
+ boolvec_t signbit() const { return vml_std::signbit(v); } // scalar library sign-bit test
+ realvec_t sin() const { return MF::vml_sin(*this); } // forwards to the generic implementation
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); } // square root of the lowest lane via the scalar SSE instruction
+ realvec_t tan() const { return MF::vml_tan(*this); } // forwards to the generic implementation
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const {
#ifdef __SSE4_1__
- return to_float(_mm_round_ss(from_float(v), from_float(v),
- _MM_FROUND_TO_ZERO));
+ return to_float(
+ _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_ZERO));
#else
- return MF::vml_trunc(*this);
+ return MF::vml_trunc(*this);
#endif
- }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,1> boolvec<float,1>::as_int() const
- {
- return I(v);
- }
-
- inline intvec<float,1> boolvec<float,1>::convert_int() const
- {
- return v;
- }
-
- inline
- boolvec<float,1> boolvec<float,1>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return v ? x : y;
- }
-
- inline intvec<float,1> boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const
- {
- return v ? x : y;
- }
-
- inline
- realvec<float,1> boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const
- {
- return v ? x : y;
- }
-
-
-
- // intvec definitions
-
- inline realvec<float,1> intvec<float,1>::as_float() const
- {
- return FP::as_float(v);
- }
-
- inline intvec<float,1> intvec<float,1>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline realvec<float,1> intvec<float,1>::convert_float() const
- {
- // return FP::convert_float(v);
- return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v));
}
-
- inline intvec<float,1> intvec<float,1>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,1> intvec<float,1>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+};
+
+// boolvec definitions
+
+inline intvec<float, 1> boolvec<float, 1>::as_int() const { return I(v); } // explicit cast of the stored value to int_t
+
+inline intvec<float, 1> boolvec<float, 1>::convert_int() const { return v; } // same value, produced through implicit conversion
+
+inline boolvec<float, 1> boolvec<float, 1>::ifthen(boolvec_t x, // select: x when this condition holds, otherwise y
+ boolvec_t y) const {
+ return v ? x : y;
+}
+
+inline intvec<float, 1> boolvec<float, 1>::ifthen(intvec_t x, // integer overload of the same selection
+ intvec_t y) const {
+ return v ? x : y;
+}
+
+inline realvec<float, 1> boolvec<float, 1>::ifthen(realvec_t x, // floating-point overload of the same selection
+ realvec_t y) const {
+ return v ? x : y;
+}
+
+// intvec definitions
+
+inline realvec<float, 1> intvec<float, 1>::as_float() const { // delegates to FP::as_float (defined elsewhere; presumably a bit-level reinterpretation - confirm)
+ return FP::as_float(v);
+}
+
+inline intvec<float, 1> intvec<float, 1>::bitifthen(intvec_t x, // forwards to the generic MF::vml_bitifthen with (*this, x, y)
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline realvec<float, 1> intvec<float, 1>::convert_float() const { // value conversion int -> float
+ // return FP::convert_float(v);
+ return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v)); // convert v into the lowest lane of a zeroed register, then read that lane back
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(int_t n) const { // forwards to the generic MF::vml_rotate, scalar count
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 1> intvec<float, 1>::rotate(intvec_t n) const { // forwards to the generic MF::vml_rotate, vector count
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_SSE_FLOAT1_H
+#endif // #ifndef VEC_SSE_FLOAT1_H
diff --git a/vec_sse_float4.h b/vec_sse_float4.h
index 940de67..f8e8e80 100644
--- a/vec_sse_float4.h
+++ b/vec_sse_float4.h
@@ -11,766 +11,642 @@
// SSE2 intrinsics
#include <xmmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-# include <pmmintrin.h>
+#ifdef __SSE3__ // Intel's SSE 3
+#include <pmmintrin.h>
#endif
-#ifdef __SSSE3__ // Intel's SSSE 3
-# include <tmmintrin.h>
+#ifdef __SSSE3__ // Intel's SSSE 3
+#include <tmmintrin.h>
#endif
-#if defined __SSE4_1__ // Intel's SSE 4.1
-# include <smmintrin.h>
+#if defined __SSE4_1__ // Intel's SSE 4.1
+#include <smmintrin.h>
#endif
-#if defined __SSE4A__ // AMD's SSE 4a
-# include <ammintrin.h>
+#if defined __SSE4A__ // AMD's SSE 4a
+#include <ammintrin.h>
#endif
-#if defined __AVX__ // Intel's AVX
-# include <immintrin.h>
+#if defined __AVX__ // Intel's AVX
+#include <immintrin.h>
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_FLOAT_4
- template<> struct boolvec<float,4>;
- template<> struct intvec<float,4>;
- template<> struct realvec<float,4>;
-
-
-
- template<>
- struct boolvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef bool scalar_t;
- typedef __m128 bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values have the sign bit set, false values have it unset
- static uint_t from_bool(bool a) { return - int_t(a); }
- static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a):
- v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {}
- boolvec(bool const* as):
- v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]),
- from_bool(as[2]),
- from_bool(as[1]),
- from_bool(as[0])))) {}
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec_t& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); }
-
- boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); }
- boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); }
- boolvec_t operator==(boolvec_t x) const { return !(*this!=x); }
- boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); }
-
- bool all() const
- {
- // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
+template <> struct boolvec<float, 4>;
+template <> struct intvec<float, 4>;
+template <> struct realvec<float, 4>;
+
+template <> struct boolvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef bool scalar_t;
+ typedef __m128 bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return -int_t(a); } // true -> -1 (all bits set, including the sign bit), false -> 0
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } // only the sign bit decides the result
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {} // wrap an existing mask register as-is
+ boolvec(bool a) : v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {} // broadcast one bool to all four lanes
+ boolvec(bool const *as) // build from four bools; _mm_set_epi32 takes the highest lane first, so as[0] ends up in lane 0
+ : v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), from_bool(as[2]),
+ from_bool(as[1]), from_bool(as[0])))) {
+ }
+
+ operator bvector_t() const { return v; } // implicit access to the underlying register
+ bool operator[](int n) const { // read lane n as a bool, via the shared get_elt helper
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec_t &set_elt(int n, bool a) { // write lane n via the shared set_elt helper; returns *this for chaining
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); } // XOR with an all-ones mask flips every bit, and with it each lane's sign bit
+
+ boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); } // lane-wise AND of the masks; both operands are always evaluated (no short-circuit)
+ boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); } // lane-wise OR of the masks; no short-circuit either
+ boolvec_t operator==(boolvec_t x) const { return !(*this != x); } // equality is the negation of operator!=
+ boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); } // XOR sets a lane's sign bit exactly where the two sign bits differ
+
+ bool all() const {
+// return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3];
#if defined __AVX__
- return ! (! *this).any();
+ return !(!*this).any();
#else
- boolvec_t x = *this;
- x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
- return x[0] && x[2];
+ boolvec_t x = *this;
+ x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+ return x[0] && x[2];
#endif
- }
- bool any() const
- {
- // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
+ }
+ bool any() const {
+// return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3];
#if defined __AVX__
- return ! bool(_mm_testz_ps(v, v));
+ return !bool(_mm_testz_ps(v, v));
#else
- boolvec_t x = *this;
- x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1));
- return x[0] || x[2];
+ boolvec_t x = *this;
+ x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1));
+ return x[0] || x[2];
#endif
+ }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef int_t scalar_t;
+ typedef __m128i ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {} // wrap an existing integer register as-is
+ intvec(int_t a) : v(_mm_set1_epi32(a)) {} // broadcast one integer to all four lanes
+ intvec(int_t const *as) : v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {} // _mm_set_epi32 takes the highest lane first, so as[0] ends up in lane 0
+ static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); } // lanes hold 0, 1, 2, 3 (lane 0 first)
+
+ operator ivector_t() const { return v; } // implicit access to the underlying register
+ int_t operator[](int n) const { // read lane n via the shared get_elt helper
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) { // write lane n via the shared set_elt helper; returns *this for chaining
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ boolvec_t as_bool() const { return _mm_castsi128_ps(v); } // reinterpret the same bits as a mask; no value conversion takes place
+ boolvec_t convert_bool() const { // value conversion: zero -> false, anything else -> true
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ return !IV(_mm_cmpeq_epi32(v, IV(0))).as_bool(); // the compare yields all-ones where a lane equals 0; negating gives "lane is non-zero"
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec_t operator+() const { return *this; } // unary plus is the identity
+ intvec_t operator-() const { return IV(0) - *this; } // negation written as 0 - x
+
+ intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); } // lane-wise 32-bit addition
+ intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); } // lane-wise 32-bit subtraction
+
+ intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } // compound forms are expressed through the binary operators
+ intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; }
+
+ intvec_t operator~() const { return IV(~U(0)) ^ *this; } // bitwise complement as XOR with an all-ones vector
+
+ intvec_t operator&(intvec_t x) const { // bitwise AND, carried out with the float-register intrinsic; the casts only reinterpret bits
+ return _mm_castps_si128(
+ _mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+ }
+ intvec_t operator|(intvec_t x) const { // bitwise OR, same cast-and-operate pattern
+ return _mm_castps_si128(
+ _mm_or_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+ }
+ intvec_t operator^(intvec_t x) const { // bitwise XOR, same cast-and-operate pattern
+ return _mm_castps_si128(
+ _mm_xor_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v)));
+ }
+
+ intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } // compound forms are expressed through the binary operators
+ intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; }
+ intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); } // logical shift right: every lane shifted by the same count n, zeros shifted in
+ intvec_t rotate(int_t n) const; // declared here, defined out of class
+ intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); } // arithmetic shift right: the sign bit is replicated into the vacated bits
+ intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); } // shift left: zeros shifted in
+ intvec_t &operator>>=(int_t n) { return *this = *this >> n; } // compound forms are expressed through the binary operators
+ intvec_t &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec_t lsr(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef int_t scalar_t;
- typedef __m128i ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(_mm_set1_epi32(a)) {}
- intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {}
- static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- boolvec_t as_bool() const { return _mm_castsi128_ps(v); }
- boolvec_t convert_bool() const
- {
- // Result: convert_bool(0)=false, convert_bool(else)=true
- return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool();
- }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Note: not all arithmetic operations are supported!
-
- intvec_t operator+() const { return *this; }
- intvec_t operator-() const { return IV(0) - *this; }
-
- intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); }
- intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); }
-
- intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; }
- intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; }
-
-
-
- intvec_t operator~() const { return IV(~U(0)) ^ *this; }
-
- intvec_t operator&(intvec_t x) const
- {
- return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v),
- _mm_castsi128_ps(x.v)));
- }
- intvec_t operator|(intvec_t x) const
- {
- return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v),
- _mm_castsi128_ps(x.v)));
- }
- intvec_t operator^(intvec_t x) const
- {
- return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v),
- _mm_castsi128_ps(x.v)));
- }
-
- intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; }
- intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; }
- intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); }
- intvec_t rotate(int_t n) const;
- intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); }
- intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); }
- intvec_t& operator>>=(int_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec_t lsr(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
- }
- intvec_t rotate(intvec_t n) const;
- intvec_t operator>>(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
- }
- intvec_t operator<<(intvec_t n) const
- {
- intvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
- }
- intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; }
- intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
- return ! (*this != x);
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
- return (*this ^ x).convert_bool();
- }
- boolvec_t operator<(intvec_t const& x) const
- {
- // return (*this - x).as_bool();
- boolvec_t r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] < x[i]);
- }
- return r;
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- return x < *this;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<float,4>: floatprops<float>
- {
- static int const size = 4;
- typedef real_t scalar_t;
- typedef __m128 vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<SSE2:4*float>"; }
- void barrier() { __asm__("": "+x"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(_mm_set1_ps(a)) {}
- realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return _mm_load_ps(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- return _mm_loadu_ps(p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- if (ioff==0) return loada(p);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
+ return r;
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec_t operator>>(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
}
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
+ return r;
+ }
+ intvec_t operator<<(intvec_t n) const {
+ intvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- _mm_store_ps(p, v);
+ return r;
+ }
+ intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; }
+ intvec_t &operator<<=(intvec_t n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ boolvec_t operator==(intvec_t const &x) const { return !(*this != x); }
+ boolvec_t operator!=(intvec_t const &x) const {
+ return (*this ^ x).convert_bool();
+ }
+ boolvec_t operator<(intvec_t const &x) const {
+ // return (*this - x).as_bool();
+ boolvec_t r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] < x[i]);
}
- void storeu(real_t* p) const
- {
- return _mm_storeu_ps(p, v);
+ return r;
+ }
+ boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec_t const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<float, 4> : floatprops<float> {
+ static int const size = 4;
+ typedef real_t scalar_t;
+ typedef __m128 vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<SSE2:4*float>"; }
+ void barrier() { __asm__("" : "+x"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(_mm_set1_ps(a)) {}
+ realvec(real_t const *as) : v(_mm_set_ps(as[3], as[2], as[1], as[0])) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return _mm_load_ps(p);
+ }
+ static realvec_t loadu(real_t const *p) { return _mm_loadu_ps(p); }
+ // Load `size` consecutive elements starting at p + ioff. The base pointer p
+ // itself must be aligned (checked by the assertion). When ioff is a whole
+ // number of vectors the resulting address is aligned as well, so the
+ // aligned load is used; every other offset goes through the unaligned load.
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+   VML_ASSERT(intptr_t(p) % alignment == 0);
+   // ioff == 0 is handled here too, because 0 % size == 0.
+   if (ioff % realvec::size == 0)
+     return loada(p + ioff);
+   return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ _mm_store_ps(p, v);
+ }
+ void storeu(real_t *p) const { return _mm_storeu_ps(p, v); }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
#if defined __AVX__
- _mm_maskstore_ps(p, m.m.as_int(), v);
+ _mm_maskstore_ps(p, m.m.as_int(), v);
#else
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
#endif
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- if (m.m[2]) p[2] = (*this)[2];
- if (m.m[3]) p[3] = (*this)[3];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return _mm_castps_si128(v); }
- intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
-
-
-
- realvec_t operator+() const { return *this; }
- realvec_t operator-() const { return RV(0.0) - *this; }
-
- realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); }
- realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); }
- realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); }
- realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); }
-
- realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; }
- realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; }
- realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; }
- realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
- // vml_std::fmax((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
- realvec_t y0022 = x0123.fmax(x1032);
- return vml_std::fmax(y0022[0], y0022[2]);
- }
- real_t minval() const
- {
- // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
- // vml_std::fmin((*this)[2], (*this)[3]));
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
- realvec_t y0022 = x0123.fmin(x1032);
- return vml_std::fmin(y0022[0], y0022[2]);
}
- real_t prod() const
- {
- // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
- realvec_t y0022 = x0123 * x1032;
- return y0022[0] * y0022[2];
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
+ if (m.m[2])
+ p[2] = (*this)[2];
+ if (m.m[3])
+ p[3] = (*this)[3];
}
- real_t sum() const
- {
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return _mm_castps_si128(v); }
+ intvec_t convert_int() const { return _mm_cvttps_epi32(v); }
+
+ realvec_t operator+() const { return *this; }
+ // Unary minus: flip the sign bit of every lane. Computing 0.0 - x instead
+ // would turn +0.0 into +0.0 rather than -0.0, so it is not a true negation.
+ realvec_t operator-() const { return _mm_xor_ps(v, _mm_set1_ps(-0.0f)); }
+
+ realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); }
+ realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); }
+ realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); }
+ realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); }
+
+ realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; }
+ realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; }
+ realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; }
+ realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; }
+
+ // Horizontal maximum over the four lanes of this vector.
+ // NOTE(review): the pairwise step uses the member fmax(), which maps to
+ // _mm_max_ps; its NaN handling may differ from vml_std::fmax -- confirm.
+ real_t maxval() const {
+ // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]),
+ // vml_std::fmax((*this)[2], (*this)[3]));
+ realvec_t x0123 = *this;
+ // Immediate 0b10110001 selects source lanes 1,0,3,2, i.e. it swaps the two
+ // lanes inside each adjacent pair.
+ realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+ // Lane 0 now holds the max of lanes 0 and 1, lane 2 the max of lanes 2 and 3.
+ realvec_t y0022 = x0123.fmax(x1032);
+ return vml_std::fmax(y0022[0], y0022[2]);
+ }
+ // Horizontal minimum over the four lanes of this vector.
+ // NOTE(review): the pairwise step uses the member fmin(), which maps to
+ // _mm_min_ps; its NaN handling may differ from vml_std::fmin -- confirm.
+ real_t minval() const {
+ // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]),
+ // vml_std::fmin((*this)[2], (*this)[3]));
+ realvec_t x0123 = *this;
+ // Immediate 0b10110001 selects source lanes 1,0,3,2, i.e. it swaps the two
+ // lanes inside each adjacent pair.
+ realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+ // Lane 0 now holds the min of lanes 0 and 1, lane 2 the min of lanes 2 and 3.
+ realvec_t y0022 = x0123.fmin(x1032);
+ return vml_std::fmin(y0022[0], y0022[2]);
+ }
+ // Product of the four lanes, evaluated as (x0*x1) * (x2*x3).
+ real_t prod() const {
+ // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3];
+ realvec_t x0123 = *this;
+ // Immediate 0b10110001 selects source lanes 1,0,3,2, i.e. it swaps the two
+ // lanes inside each adjacent pair.
+ realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+ // Lane 0 now holds x0*x1 and lane 2 holds x2*x3.
+ realvec_t y0022 = x0123 * x1032;
+ return y0022[0] * y0022[2];
+ }
+ real_t sum() const {
#ifdef __SSE3__
- realvec_t x = *this;
- x = _mm_hadd_ps(x.v, x.v);
- x = _mm_hadd_ps(x.v, x.v);
- return x[0];
+ realvec_t x = *this;
+ x = _mm_hadd_ps(x.v, x.v);
+ x = _mm_hadd_ps(x.v, x.v);
+ return x[0];
#else
- // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
- realvec_t x0123 = *this;
- realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
- realvec_t y0022 = x0123 + x1032;
- return y0022[0] + y0022[2];
+ // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3];
+ realvec_t x0123 = *this;
+ realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001);
+ realvec_t y0022 = x0123 + x1032;
+ return y0022[0] + y0022[2];
#endif
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- return _mm_cmpeq_ps(v, x.v);
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- return _mm_cmpneq_ps(v, x.v);
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- return _mm_cmplt_ps(v, x.v);
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- return _mm_cmple_ps(v, x.v);
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- return _mm_cmpgt_ps(v, x.v);
- }
- boolvec_t operator>=(realvec_t const& x) const
- {
- return _mm_cmpge_ps(v, x.v);
- }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const
- {
+ }
+
+ boolvec_t operator==(realvec_t const &x) const {
+ return _mm_cmpeq_ps(v, x.v);
+ }
+ boolvec_t operator!=(realvec_t const &x) const {
+ return _mm_cmpneq_ps(v, x.v);
+ }
+ boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_ps(v, x.v); }
+ boolvec_t operator<=(realvec_t const &x) const {
+ return _mm_cmple_ps(v, x.v);
+ }
+ boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_ps(v, x.v); }
+ boolvec_t operator>=(realvec_t const &x) const {
+ return _mm_cmpge_ps(v, x.v);
+ }
+
+ realvec_t acos() const { return MF::vml_acos(*this); }
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const {
#ifdef __SSE4_1__
- return _mm_ceil_ps(v);
+ return _mm_ceil_ps(v);
#else
- return MF::vml_ceil(*this);
+ return MF::vml_ceil(*this);
#endif
- }
- realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return MF::vml_fabs(*this); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const
- {
+ }
+ realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return MF::vml_fabs(*this); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const {
#ifdef __SSE4_1__
- return _mm_floor_ps(v);
+ return _mm_floor_ps(v);
#else
- return MF::vml_floor(*this);
+ return MF::vml_floor(*this);
#endif
- }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
- }
- realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
- realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
- realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
- realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const
- {
+ }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
+ realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
+ realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+ realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const {
#if defined VML_HAVE_NAN
- return _mm_cmpunord_ps(v, v);
+ return _mm_cmpunord_ps(v, v);
#else
- return BV(false);
+ return BV(false);
#endif
- }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const
- {
- realvec_t x = *this;
- realvec_t r = _mm_rcp_ps(x); // this is only an approximation
- r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp)
- return r;
- }
- realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
- realvec_t rint() const
- {
+ }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ // Per-lane reciprocal: start from the hardware estimate and refine it with
+ // a single Newton step, r <- r * (2 - r*x).
+ realvec_t rcp() const {
+ realvec_t x = *this;
+ realvec_t r = _mm_rcp_ps(x); // this is only an approximation
+ r *= RV(2.0) - r * x; // one Newton iteration (see vml_rcp)
+ return r;
+ }
+ realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+ realvec_t rint() const {
#ifdef __SSE4_1__
- return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
+ return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
#else
- return MF::vml_rint(*this);
+ return MF::vml_rint(*this);
#endif
- }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const
- {
- realvec_t x = *this;
- realvec_t r = _mm_rsqrt_ps(x); // this is only an approximation
- r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt)
- return r;
- }
- boolvec_t signbit() const { return v; }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return _mm_sqrt_ps(v); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const
- {
+ }
+ realvec_t round() const { return MF::vml_round(*this); }
+ // Per-lane reciprocal square root: start from the hardware estimate and
+ // refine it with a single Newton step, r <- r * (1.5 - 0.5*x*r*r).
+ realvec_t rsqrt() const {
+ realvec_t x = *this;
+ realvec_t r = _mm_rsqrt_ps(x); // this is only an approximation
+ r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt)
+ return r;
+ }
+ boolvec_t signbit() const { return v; }
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return _mm_sqrt_ps(v); }
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const {
#ifdef __SSE4_1__
- return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
+ return _mm_round_ps(v, _MM_FROUND_TO_ZERO);
#else
- return MF::vml_trunc(*this);
+ return MF::vml_trunc(*this);
#endif
- }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<float,4> boolvec<float,4>::as_int() const
- {
- return _mm_castps_si128(v);
- }
-
- inline intvec<float,4> boolvec<float,4>::convert_int() const
- {
- return lsr(as_int(), bits-1);
- }
-
- inline
- boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return ifthen(x.as_int(), y.as_int()).as_bool();
- }
-
- inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const
- {
- return ifthen(x.as_float(), y.as_float()).as_int();
- }
-
- inline
- realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const
- {
+ }
+};
+
+// boolvec definitions
+
+// Reinterpret the mask register as an integer vector; the bits are passed
+// through unchanged (cast only, no conversion).
+inline intvec<float, 4> boolvec<float, 4>::as_int() const {
+ return _mm_castps_si128(v);
+}
+
+// Convert the mask to integers by shifting each lane right by bits-1.
+// NOTE(review): presumably lsr is a logical shift, so the top bit ends up in
+// bit 0 and each lane becomes 0 or 1 -- confirm against lsr's definition.
+inline intvec<float, 4> boolvec<float, 4>::convert_int() const {
+ return lsr(as_int(), bits - 1);
+}
+
+// Lane-wise select between two boolean vectors: both operands are viewed as
+// integer vectors, passed to the integer overload of ifthen, and the result
+// is viewed as a boolean vector again.
+inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x,
+ boolvec_t y) const {
+ return ifthen(x.as_int(), y.as_int()).as_bool();
+}
+
+// Lane-wise select between two integer vectors: both operands are
+// reinterpreted as float vectors, passed to the realvec overload of ifthen,
+// and the result is reinterpreted as integers again.
+inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x,
+ intvec_t y) const {
+ return ifthen(x.as_float(), y.as_float()).as_int();
+}
+
+inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x,
+ realvec_t y) const {
#ifdef __SSE4_1__
- return _mm_blendv_ps(y.v, x.v, v);
+ return _mm_blendv_ps(y.v, x.v, v);
#else
- return (( -convert_int() & x.as_int()) |
- (~-convert_int() & y.as_int())).as_float();
+ return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int()))
+ .as_float();
#endif
- }
+}
+
+// intvec definitions
-
-
- // intvec definitions
-
- inline intvec<float,4> intvec<float,4>::abs() const
- {
+inline intvec<float, 4> intvec<float, 4>::abs() const {
#ifdef __SSSE3__
- return _mm_abs_epi32(v);
+ return _mm_abs_epi32(v);
#else
- return MF::vml_abs(*this);
+ return MF::vml_abs(*this);
#endif
- }
-
- inline realvec<float,4> intvec<float,4>::as_float() const
- {
- return _mm_castsi128_ps(v);
- }
-
- inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<float,4> intvec<float,4>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<float,4> intvec<float,4>::convert_float() const
- {
- return _mm_cvtepi32_ps(v);
- }
-
- inline intvec<float,4> intvec<float,4>::max(intvec_t x) const
- {
+}
+
+inline realvec<float, 4> intvec<float, 4>::as_float() const {
+ return _mm_castsi128_ps(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x,
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+}
+
+inline intvec<float, 4> intvec<float, 4>::clz() const {
+ return MF::vml_clz(*this);
+}
+
+inline realvec<float, 4> intvec<float, 4>::convert_float() const {
+ return _mm_cvtepi32_ps(v);
+}
+
+inline intvec<float, 4> intvec<float, 4>::max(intvec_t x) const {
#ifdef __SSE4_1__
- return _mm_max_epi32(v, x.v);
+ return _mm_max_epi32(v, x.v);
#else
- return MF::vml_max(*this, x);
+ return MF::vml_max(*this, x);
#endif
- }
-
- inline intvec<float,4> intvec<float,4>::min(intvec_t x) const
- {
+}
+
+inline intvec<float, 4> intvec<float, 4>::min(intvec_t x) const {
#ifdef __SSE4_1__
- return _mm_min_epi32(v, x.v);
+ return _mm_min_epi32(v, x.v);
#else
- return MF::vml_min(*this, x);
+ return MF::vml_min(*this, x);
#endif
- }
-
- inline intvec<float,4> intvec<float,4>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+}
+
+inline intvec<float, 4> intvec<float, 4>::popcount() const {
+ return MF::vml_popcount(*this);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
+inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const {
+ return MF::vml_rotate(*this, n);
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_SSE_FLOAT4_H
+#endif // #ifndef VEC_SSE_FLOAT4_H
diff --git a/vec_test.h b/vec_test.h
index 46fc9d1..c27b75e 100644
--- a/vec_test.h
+++ b/vec_test.h
@@ -9,1474 +9,1280 @@
#include <cmath>
#ifndef VML_NO_IOSTREAM
-# include <sstream>
+#include <sstream>
#endif
+namespace vecmathlib {
+template <typename T, int N> struct booltestvec;
+template <typename T, int N> struct inttestvec;
+template <typename T, int N> struct realtestvec;
+
+// Boolean vector with N lanes, stored as a plain array of bool.
+template <typename T, int N> struct booltestvec : floatprops<T> {
+  using int_t = typename floatprops<T>::int_t;
+  using uint_t = typename floatprops<T>::uint_t;
+  using real_t = typename floatprops<T>::real_t;
+
+  static int const size = N;
+  using scalar_t = bool;
+  using bvector_t = bool[size];
+  static int const alignment = sizeof(bool);
+
+  using boolvec_t = booltestvec;
+  using intvec_t = inttestvec<real_t, size>;
+  using realvec_t = realtestvec<real_t, size>;
+
+  // Abbreviations for use in type casts
+  using R = real_t;
+  using I = int_t;
+  using U = uint_t;
+  using RV = realvec_t;
+  using IV = intvec_t;
+  using BV = boolvec_t;
+  using FP = floatprops<real_t>;
+  using MF = mathfuncs<realvec_t>;
+
+  // One bool per lane
+  bvector_t v;
+
+  // The default constructor leaves the lanes uninitialised. Copy construction
+  // and copy assignment are deliberately left implicit: a non-trivial copy
+  // constructor would keep objects from being passed in registers.
+  booltestvec() {}
+  // Broadcast one value to all lanes
+  booltestvec(bool a) {
+    for (int i = 0; i < size; ++i) {
+      v[i] = a;
+    }
+  }
+  // Copy `size` values from an array
+  booltestvec(bool const *as) {
+    for (int i = 0; i < size; ++i) {
+      v[i] = as[i];
+    }
+  }
+
+  // Lane access
+  bool operator[](int n) const { return v[n]; }
+  boolvec_t &set_elt(int n, bool a) {
+    v[n] = a;
+    return *this;
+  }
+
+  intvec_t as_int() const;      // defined after inttestvec
+  intvec_t convert_int() const; // defined after inttestvec
+
+  // Lane-wise logical negation
+  boolvec_t operator!() const {
+    boolvec_t out;
+    for (int i = 0; i < size; ++i) {
+      out.v[i] = !v[i];
+    }
+    return out;
+  }
+
+  // Lane-wise logical and / or
+  boolvec_t operator&&(boolvec_t x) const {
+    boolvec_t out;
+    for (int i = 0; i < size; ++i) {
+      out.v[i] = v[i] && x.v[i];
+    }
+    return out;
+  }
+  boolvec_t operator||(boolvec_t x) const {
+    boolvec_t out;
+    for (int i = 0; i < size; ++i) {
+      out.v[i] = v[i] || x.v[i];
+    }
+    return out;
+  }
+  // Lane-wise equality / inequality
+  boolvec_t operator==(boolvec_t x) const {
+    boolvec_t out;
+    for (int i = 0; i < size; ++i) {
+      out.v[i] = v[i] == x.v[i];
+    }
+    return out;
+  }
+  boolvec_t operator!=(boolvec_t x) const {
+    boolvec_t out;
+    for (int i = 0; i < size; ++i) {
+      out.v[i] = v[i] != x.v[i];
+    }
+    return out;
+  }
+
+  // True when every lane is set; stops at the first unset lane
+  bool all() const {
+    for (int i = 0; i < size; ++i) {
+      if (!v[i])
+        return false;
+    }
+    return true;
+  }
+  // True when at least one lane is set; stops at the first set lane
+  bool any() const {
+    for (int i = 0; i < size; ++i) {
+      if (v[i])
+        return true;
+    }
+    return false;
+  }
+
+  // ifthen(condition, then-value, else-value)
+  boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+  intvec_t ifthen(intvec_t x, intvec_t y) const;    // defined after inttestvec
+  realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
+};
+
+template <typename T, int N> struct inttestvec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static int const size = N;
+ typedef int_t scalar_t;
+ typedef int_t ivector_t[size];
+ static int const alignment = sizeof(int_t);
+
+ typedef booltestvec<real_t, size> boolvec_t;
+ typedef inttestvec intvec_t;
+ typedef realtestvec<real_t, size> realvec_t;
+
+ // short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ inttestvec() {}
+ // can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // inttestvec(inttestvec const& x): v(x.v) {}
+ // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
+ // inttestvec(vector_t x): v(x) {}
+ inttestvec(int_t a) {
+ for (int d = 0; d < size; ++d)
+ v[d] = a;
+ }
+ inttestvec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ v[d] = as[d];
+ }
+ static intvec_t iota() {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = d;
+ return res;
+ }
+
+ int_t operator[](int n) const { return v[n]; }
+ intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; }
+
+ boolvec_t as_bool() const { return convert_bool(); }
+ boolvec_t convert_bool() const {
+ // result: convert_bool(0)=false, convert_bool(else)=true
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d];
+ return res;
+ }
+ realvec_t as_float() const; // defined after realtestvec
+ realvec_t convert_float() const; // defined after realtestvec
+
+ intvec_t operator+() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = +v[d];
+ return res;
+ }
+ intvec_t operator-() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = -v[d];
+ return res;
+ }
+
+ intvec_t &operator+=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] += x.v[d];
+ return *this;
+ }
+ intvec_t &operator-=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] -= x.v[d];
+ return *this;
+ }
+ intvec_t &operator*=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] *= x.v[d];
+ return *this;
+ }
+ intvec_t &operator/=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] /= x.v[d];
+ return *this;
+ }
+ intvec_t &operator%=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] %= x.v[d];
+ return *this;
+ }
+
+ intvec_t operator+(intvec_t x) const {
+ intvec_t res = *this;
+ return res += x;
+ }
+ intvec_t operator-(intvec_t x) const {
+ intvec_t res = *this;
+ return res -= x;
+ }
+ intvec_t operator*(intvec_t x) const {
+ intvec_t res = *this;
+ return res *= x;
+ }
+ intvec_t operator/(intvec_t x) const {
+ intvec_t res = *this;
+ return res /= x;
+ }
+ intvec_t operator%(intvec_t x) const {
+ intvec_t res = *this;
+ return res %= x;
+ }
+
+ intvec_t operator~() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = ~v[d];
+ return res;
+ }
+
+ intvec_t &operator&=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] &= x.v[d];
+ return *this;
+ }
+ intvec_t &operator|=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] |= x.v[d];
+ return *this;
+ }
+ intvec_t &operator^=(intvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] ^= x.v[d];
+ return *this;
+ }
+
+ intvec_t operator&(intvec_t x) const {
+ intvec_t res = *this;
+ return res &= x;
+ }
+ intvec_t operator|(intvec_t x) const {
+ intvec_t res = *this;
+ return res |= x;
+ }
+ intvec_t operator^(intvec_t x) const {
+ intvec_t res = *this;
+ return res ^= x;
+ }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y);
+ }
+
+ intvec_t lsr(int_t n) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = I(U(v[d]) >> U(n));
+ return res;
+ }
+ intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
+ intvec_t &operator>>=(int_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] >>= n;
+ return *this;
+ }
+ intvec_t &operator<<=(int_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] <<= n;
+ return *this;
+ }
+ intvec_t operator>>(int_t n) const {
+ intvec_t res = *this;
+ return res >>= n;
+ }
+ intvec_t operator<<(int_t n) const {
+ intvec_t res = *this;
+ return res <<= n;
+ }
+
+ intvec_t lsr(intvec_t n) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = I(U(v[d]) >> U(n.v[d]));
+ return res;
+ }
+ intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
+ intvec_t &operator>>=(intvec_t n) {
+ for (int d = 0; d < size; ++d)
+ v[d] >>= n.v[d];
+ return *this;
+ }
+ intvec_t &operator<<=(intvec_t n) { // in-place left shift, element d shifted by n.v[d]
+ for (int d = 0; d < size; ++d)
+ v[d] <<= n.v[d]; // built-in <<= of the element type
+ return *this;
+ }
+ intvec_t operator>>(intvec_t n) const { // binary >> with per-element counts: returns a shifted copy
+ intvec_t res = *this;
+ return res >>= n; // delegates to the vector-count operator>>=
+ }
+ intvec_t operator<<(intvec_t n) const { // binary << with per-element counts: returns a shifted copy
+ intvec_t res = *this;
+ return res <<= n; // delegates to the vector-count operator<<=
+ }
+
+ intvec_t clz() const { return MF::vml_clz(*this); } // forwards to MF::vml_clz (presumably count-leading-zeros per element)
+ intvec_t popcount() const { return MF::vml_popcount(*this); } // forwards to MF::vml_popcount (presumably set-bit count per element)
+
+ boolvec_t operator==(intvec_t const &x) const { // element-wise equality; result d is v[d] == x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] == x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+ boolvec_t operator!=(intvec_t const &x) const { // element-wise inequality; result d is v[d] != x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] != x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+ boolvec_t operator<(intvec_t const &x) const { // element-wise less-than; result d is v[d] < x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] < x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+ boolvec_t operator<=(intvec_t const &x) const { // element-wise less-or-equal; result d is v[d] <= x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] <= x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+ boolvec_t operator>(intvec_t const &x) const { // element-wise greater-than; result d is v[d] > x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] > x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+ boolvec_t operator>=(intvec_t const &x) const { // element-wise greater-or-equal; result d is v[d] >= x.v[d]
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] >= x.v[d];
+ return res; // a boolean vector, not a single bool
+ }
+
+ intvec_t abs() const { return MF::vml_abs(*this); } // forwards to MF::vml_abs
+ boolvec_t isignbit() const { return MF::vml_isignbit(*this); } // forwards to MF::vml_isignbit; yields a boolean vector
+ intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } // forwards to MF::vml_max with (*this, x)
+ intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } // forwards to MF::vml_min with (*this, x)
+};
+
+template <typename T, int N> struct realtestvec : floatprops<T> {
+ typedef typename floatprops<T>::int_t int_t;
+ typedef typename floatprops<T>::uint_t uint_t;
+ typedef typename floatprops<T>::real_t real_t;
+
+ static int const size = N;
+ typedef real_t scalar_t;
+ typedef real_t vector_t[size];
+ static int const alignment = sizeof(real_t);
-namespace vecmathlib {
-
- template<typename T, int N> struct booltestvec;
- template<typename T, int N> struct inttestvec;
- template<typename T, int N> struct realtestvec;
-
-
-
- template<typename T, int N>
- struct booltestvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef bool scalar_t;
- typedef bool bvector_t[size];
- static int const alignment = sizeof(bool);
-
- typedef booltestvec boolvec_t;
- typedef inttestvec<real_t, size> intvec_t;
- typedef realtestvec<real_t, size> realvec_t;
-
- // short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- booltestvec() {}
- // can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // booltestvec(booltestvec const& x): v(x.v) {}
- // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; }
- //booltestvec(vector_t x): v(x) {}
- booltestvec(bool a) { for (int d=0; d<size; ++d) v[d]=a; }
- booltestvec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-
- bool operator[](int n) const { return v[n]; }
- boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; }
-
-
-
- intvec_t as_int() const; // defined after inttestvec
- intvec_t convert_int() const; // defined after inttestvec
-
-
-
- boolvec_t operator!() const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = !v[d];
- return res;
- }
-
- boolvec_t operator&&(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d];
- return res;
- }
- boolvec_t operator||(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d];
- return res;
- }
- boolvec_t operator==(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(boolvec_t x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
-
- bool all() const
- {
- bool res = v[0];
- for (int d=1; d<size; ++d) res = res && v[d];
- return res;
- }
- bool any() const
- {
- bool res = v[0];
- for (int d=1; d<size; ++d) res = res || v[d];
- return res;
- }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after inttestvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec
- };
-
-
-
- template<typename T, int N>
- struct inttestvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef int_t scalar_t;
- typedef int_t ivector_t[size];
- static int const alignment = sizeof(int_t);
-
- typedef booltestvec<real_t, size> boolvec_t;
- typedef inttestvec intvec_t;
- typedef realtestvec<real_t, size> realvec_t;
-
- // short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- inttestvec() {}
- // can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // inttestvec(inttestvec const& x): v(x.v) {}
- // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; }
- //inttestvec(vector_t x): v(x) {}
- inttestvec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; }
- inttestvec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
- static intvec_t iota()
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d]=d;
- return res;
- }
-
- int_t operator[](int n) const { return v[n]; }
- intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; }
-
-
-
- boolvec_t as_bool() const { return convert_bool(); }
- boolvec_t convert_bool() const
- {
- // result: convert_bool(0)=false, convert_bool(else)=true
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d]=v[d];
- return res;
- }
- realvec_t as_float() const; // defined after realtestvec
- realvec_t convert_float() const; // defined after realtestvec
-
-
-
- intvec_t operator+() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = + v[d];
- return res;
- }
- intvec_t operator-() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = - v[d];
- return res;
- }
-
- intvec_t& operator+=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] += x.v[d];
- return *this;
- }
- intvec_t& operator-=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] -= x.v[d];
- return *this;
- }
- intvec_t& operator*=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] *= x.v[d];
- return *this;
- }
- intvec_t& operator/=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] /= x.v[d];
- return *this;
- }
- intvec_t& operator%=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] %= x.v[d];
- return *this;
- }
-
- intvec_t operator+(intvec_t x) const
- {
- intvec_t res = *this;
- return res += x;
- }
- intvec_t operator-(intvec_t x) const
- {
- intvec_t res = *this;
- return res -= x;
- }
- intvec_t operator*(intvec_t x) const
- {
- intvec_t res = *this;
- return res *= x;
- }
- intvec_t operator/(intvec_t x) const
- {
- intvec_t res = *this;
- return res /= x;
- }
- intvec_t operator%(intvec_t x) const
- {
- intvec_t res = *this;
- return res %= x;
- }
-
-
-
- intvec_t operator~() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = ~ v[d];
- return res;
- }
-
- intvec_t& operator&=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] &= x.v[d];
- return *this;
- }
- intvec_t& operator|=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] |= x.v[d];
- return *this;
- }
- intvec_t& operator^=(intvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] ^= x.v[d];
- return *this;
- }
-
- intvec_t operator&(intvec_t x) const
- {
- intvec_t res = *this;
- return res &= x;
- }
- intvec_t operator|(intvec_t x) const
- {
- intvec_t res = *this;
- return res |= x;
- }
- intvec_t operator^(intvec_t x) const
- {
- intvec_t res = *this;
- return res ^= x;
- }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
-
-
- intvec_t lsr(int_t n) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n));
- return res;
- }
- intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); }
- intvec_t& operator>>=(int_t n)
- {
- for (int d=0; d<size; ++d) v[d] >>= n;
- return *this;
- }
- intvec_t& operator<<=(int_t n)
- {
- for (int d=0; d<size; ++d) v[d] <<= n;
- return *this;
- }
- intvec_t operator>>(int_t n) const
- {
- intvec_t res = *this;
- return res >>= n;
- }
- intvec_t operator<<(int_t n) const
- {
- intvec_t res = *this;
- return res <<= n;
- }
-
- intvec_t lsr(intvec_t n) const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d]));
- return res;
- }
- intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); }
- intvec_t& operator>>=(intvec_t n)
- {
- for (int d=0; d<size; ++d) v[d] >>= n.v[d];
- return *this;
- }
- intvec_t& operator<<=(intvec_t n)
- {
- for (int d=0; d<size; ++d) v[d] <<= n.v[d];
- return *this;
- }
- intvec_t operator>>(intvec_t n) const
- {
- intvec_t res = *this;
- return res >>= n;
- }
- intvec_t operator<<(intvec_t n) const
- {
- intvec_t res = *this;
- return res <<= n;
- }
-
- intvec_t clz() const { return MF::vml_clz(*this); }
- intvec_t popcount() const { return MF::vml_popcount(*this); }
-
-
-
- boolvec_t operator==(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
- boolvec_t operator<(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
- return res;
- }
- boolvec_t operator<=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
- return res;
- }
- boolvec_t operator>(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
- return res;
- }
- boolvec_t operator>=(intvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
- return res;
- }
-
- intvec_t abs() const { return MF::vml_abs(*this); }
- boolvec_t isignbit() const { return MF::vml_isignbit(*this); }
- intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); }
- intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); }
- };
-
-
-
- template<typename T, int N>
- struct realtestvec: floatprops<T>
- {
- typedef typename floatprops<T>::int_t int_t;
- typedef typename floatprops<T>::uint_t uint_t;
- typedef typename floatprops<T>::real_t real_t;
-
- static int const size = N;
- typedef real_t scalar_t;
- typedef real_t vector_t[size];
- static int const alignment = sizeof(real_t);
-
#ifndef VML_NO_IOSTREAM
- static char const* name()
- {
- static std::string name_;
- if (name_.empty()) {
- std::stringstream buf;
- buf << "<VML:" << N << "*" << FP::name() << ">";
- name_ = buf.str();
- }
- return name_.c_str();
+ static char const *name() { // returns a printable type name of the form "<VML:N*elem>", where elem is FP::name()
+ static std::string name_; // cache: built on the first call, reused afterwards
+ if (name_.empty()) { // NOTE(review): this check-then-assign is not synchronized -- confirm callers are single-threaded
+ std::stringstream buf;
+ buf << "<VML:" << N << "*" << FP::name() << ">";
+ name_ = buf.str();
}
+ return name_.c_str(); // pointer into the function-local static string
+ }
#endif
- void barrier()
- {
+ void barrier() {
#if defined __GNUC__ && !defined __clang__ && !defined __ICC
- // GCC crashes when +X is used as constraint
-# if defined __SSE2__
- for (int d=0; d<size; ++d) __asm__("": "+x"(v[d]));
-# elif defined __PPC64__ // maybe also __PPC__
- for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
-# elif defined __arm__
- for (int d=0; d<size; ++d) __asm__("": "+w"(v[d]));
-# else
-# error "Floating point barrier undefined on this architecture"
-# endif
+// GCC crashes when +X is used as constraint
+#if defined __SSE2__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+x"(v[d]));
+#elif defined __PPC64__ // maybe also __PPC__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+f"(v[d]));
+#elif defined __arm__
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+w"(v[d]));
+#else
+#error "Floating point barrier undefined on this architecture"
+#endif
#elif defined __clang__
- for (int d=0; d<size; ++d) __asm__("": "+X"(v[d]));
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+X"(v[d]));
#elif defined __ICC
- for (int d=0; d<size; ++d) {
- real_t tmp = v[d];
- __asm__("": "+X"(tmp));
- v[d] = tmp;
- }
+ for (int d = 0; d < size; ++d) {
+ real_t tmp = v[d];
+ __asm__("" : "+X"(tmp));
+ v[d] = tmp;
+ }
#elif defined __IBMCPP__
- for (int d=0; d<size; ++d) __asm__("": "+f"(v[d]));
+ for (int d = 0; d < size; ++d)
+ __asm__("" : "+f"(v[d]));
#else
-# error "Floating point barrier undefined on this architecture"
+#error "Floating point barrier undefined on this architecture"
#endif
- }
-
- typedef booltestvec<real_t, size> boolvec_t;
- typedef inttestvec<real_t, size> intvec_t;
- typedef realtestvec realvec_t;
-
- // short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realtestvec() {}
- // can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realtestvec(realtestvec const& x): v(x.v) {}
- // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
- //realtestvec(vector_t x): v(x) {}
- realtestvec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; }
- realtestvec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; }
-
- real_t operator[](int n) const { return v[n]; }
- realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loadu(p);
- }
- static realvec_t loadu(real_t const* p)
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = p[d];
- return res;
- }
- static realvec_t loadu(real_t const* p, size_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- return m.m.ifthen(loada(p), *this);
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- return m.m.ifthen(loadu(p), *this);
- }
- realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
- {
- return m.m.ifthen(loadu(p, ioff), *this);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p);
- }
- void storeu(real_t* p) const
- {
- for (int d=0; d<size; ++d) p[d] = v[d];
- }
- void storeu(real_t* p, size_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p, m);
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d];
- }
- void storeu(real_t* p, size_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]);
- return res;
- }
- intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-
-
-
- realvec_t operator+() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = + v[d];
- return res;
- }
- realvec_t operator-() const
- {
- realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = - v[d];
- return res;
- }
-
- realvec_t& operator+=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] += x.v[d];
- return *this;
- }
- realvec_t& operator-=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] -= x.v[d];
- return *this;
- }
- realvec_t& operator*=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] *= x.v[d];
- return *this;
- }
- realvec_t& operator/=(realvec_t const& x)
- {
- for (int d=0; d<size; ++d) v[d] /= x.v[d];
- return *this;
- }
-
- realvec_t operator+(realvec_t x) const
- {
- realvec_t res = *this;
- return res += x;
- }
- realvec_t operator-(realvec_t x) const
- {
- realvec_t res = *this;
- return res -= x;
- }
- realvec_t operator*(realvec_t x) const
- {
- realvec_t res = *this;
- return res *= x;
- }
- realvec_t operator/(realvec_t x) const
- {
- realvec_t res = *this;
- return res /= x;
- }
-
- real_t maxval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]);
- return res;
- }
- real_t minval() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]);
- return res;
- }
- real_t prod() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res *= v[d];
- return res;
- }
- real_t sum() const
- {
- real_t res = v[0];
- for (int d=1; d<size; ++d) res += v[d];
- return res;
- }
-
-
-
- boolvec_t operator==(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d];
- return res;
- }
- boolvec_t operator!=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d];
- return res;
- }
- boolvec_t operator<(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d];
- return res;
- }
- boolvec_t operator<=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d];
- return res;
- }
- boolvec_t operator>(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d];
- return res;
- }
- boolvec_t operator>=(realvec_t const& x) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d];
- return res;
- }
-
-
-
- realvec_t acos() const { return MF::vml_acos(*this); }
- realvec_t acosh() const { return MF::vml_acosh(*this); }
- realvec_t asin() const { return MF::vml_asin(*this); }
- realvec_t asinh() const { return MF::vml_asinh(*this); }
- realvec_t atan() const { return MF::vml_atan(*this); }
- realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
- realvec_t atanh() const { return MF::vml_atanh(*this); }
- realvec_t cbrt() const { return MF::vml_cbrt(*this); }
- realvec_t ceil() const { return MF::vml_ceil(*this); }
- realvec_t copysign(realvec_t y) const
- {
- return MF::vml_copysign(*this, y);
- }
- realvec_t cos() const { return MF::vml_cos(*this); }
- realvec_t cosh() const { return MF::vml_cosh(*this); }
- realvec_t exp() const { return MF::vml_exp(*this); }
- realvec_t exp10() const { return MF::vml_exp10(*this); }
- realvec_t exp2() const { return MF::vml_exp2(*this); }
- realvec_t expm1() const { return MF::vml_expm1(*this); }
- realvec_t fabs() const { return MF::vml_fabs(*this); }
- realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
- realvec_t floor() const { return MF::vml_floor(*this); }
- realvec_t fma(realvec_t y, realvec_t z) const
- {
- return MF::vml_fma(*this, y, z);
- }
- realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
- realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
- realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
- realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec_t log() const { return MF::vml_log(*this); }
- realvec_t log10() const { return MF::vml_log10(*this); }
- realvec_t log1p() const { return MF::vml_log1p(*this); }
- realvec_t log2() const { return MF::vml_log2(*this); }
- intvec_t lrint() const { return MF::vml_lrint(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
- }
- realvec_t nextafter(realvec_t y) const
- {
- return MF::vml_nextafter(*this, y);
- }
- realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
- realvec_t rcp() const { return MF::vml_rcp(*this); }
- realvec_t remainder(realvec_t y) const
- {
- return MF::vml_remainder(*this, y);
- }
- realvec_t rint() const { return MF::vml_rint(*this); }
- realvec_t round() const { return MF::vml_round(*this); }
- realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
- boolvec_t signbit() const { return MF::vml_signbit(*this); }
- realvec_t sin() const { return MF::vml_sin(*this); }
- realvec_t sinh() const { return MF::vml_sinh(*this); }
- realvec_t sqrt() const { return MF::vml_sqrt(*this); }
- realvec_t tan() const { return MF::vml_tan(*this); }
- realvec_t tanh() const { return MF::vml_tanh(*this); }
- realvec_t trunc() const { return MF::vml_trunc(*this); }
- };
-
-
-
- // booltestvec definitions
-
- template<typename T, int N>
- inline
- typename booltestvec<T,N>::intvec_t
- booltestvec<T,N>::as_int() const
- {
- return convert_int();
- }
-
- template<typename T, int N>
- inline
- typename booltestvec<T,N>::intvec_t
- booltestvec<T,N>::convert_int() const
- {
- intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d];
- return res;
}
-
- template<typename T, int N>
- inline
- typename booltestvec<T,N>::boolvec_t
- booltestvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const
- {
- boolvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+
+ typedef booltestvec<real_t, size> boolvec_t;
+ typedef inttestvec<real_t, size> intvec_t;
+ typedef realtestvec realvec_t;
+
+ // short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realtestvec() {} // default constructor: empty body, does not assign any element of v
+ // can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realtestvec(realtestvec const& x): v(x.v) {}
+ // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; }
+ // realtestvec(vector_t x): v(x) {}
+ realtestvec(real_t a) { // broadcast constructor: every element is set to the scalar a
+ for (int d = 0; d < size; ++d)
+ v[d] = a;
+ }
+ realtestvec(real_t const *as) { // array constructor: copies as[0..size-1]; no bounds or null check is performed here
+ for (int d = 0; d < size; ++d)
+ v[d] = as[d];
+ }
+
+ real_t operator[](int n) const { return v[n]; } // read-only element access by value; n is not range-checked
+ realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } // writes element n, then returns *this (comma operator); n is not range-checked
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) { // aligned load of size elements starting at p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of alignment (sizeof(real_t) for this type)
+ return loadu(p); // after the check the load itself is the same as the unaligned one
+ }
+ static realvec_t loadu(real_t const *p) { // unaligned load: copies p[0..size-1] into a new vector
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = p[d];
return res;
}
-
- template<typename T, int N>
- inline
- typename booltestvec<T,N>::intvec_t
- booltestvec<T,N>::ifthen(intvec_t x, intvec_t y) const
- {
+ static realvec_t loadu(real_t const *p, size_t ioff) { // load size elements starting at p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the check is on the base pointer p, not on p + ioff
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const { // masked variant of loada; result is returned, *this is not modified
+ return m.m.ifthen(loada(p), *this); // full load is performed first, then m.m selects between loaded values and *this
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const { // masked variant of loadu; result is returned, *this is not modified
+ return m.m.ifthen(loadu(p), *this); // full load is performed first, then m.m selects between loaded values and *this
+ }
+ realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const { // masked variant of loadu(p, ioff)
+ return m.m.ifthen(loadu(p, ioff), *this); // full load is performed first, then m.m selects between loaded values and *this
+ }
+
+ void storea(real_t *p) const { // aligned store of all size elements to p
+ VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of alignment
+ storeu(p); // after the check the store itself is the same as the unaligned one
+ }
+ void storeu(real_t *p) const { // unaligned store: writes v[0..size-1] to p[0..size-1]
+ for (int d = 0; d < size; ++d)
+ p[d] = v[d];
+ }
+ void storeu(real_t *p, size_t ioff) const { // store all elements starting at p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the check is on the base pointer p, not on p + ioff
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const { // masked aligned store
+ VML_ASSERT(intptr_t(p) % alignment == 0); // p must be a multiple of alignment
+ storeu(p, m); // the masked element-by-element write is done by storeu(p, m)
+ }
+ void storeu(real_t *p, mask_t const &m) const { // masked unaligned store
+ for (int d = 0; d < size; ++d)
+ if (m.m[d]) // only positions whose mask entry is true are written; other p[d] are left untouched
+ p[d] = v[d];
+ }
+ void storeu(real_t *p, size_t ioff, mask_t const &m) const { // masked store starting at p + ioff
+ VML_ASSERT(intptr_t(p) % alignment == 0); // the check is on the base pointer p, not on p + ioff
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { // applies FP::as_int to every element and collects the results in an intvec_t
intvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::as_int(v[d]); // presumably a bit-level reinterpretation (as opposed to convert_int) -- confirm in floatprops
return res;
}
-
- template<typename T, int N>
- inline
- typename booltestvec<T,N>::realvec_t
- booltestvec<T,N>::ifthen(realvec_t x, realvec_t y) const
- {
+ intvec_t convert_int() const { return MF::vml_convert_int(*this); } // forwards to MF::vml_convert_int
+
+ realvec_t operator+() const { // unary plus: returns a vector whose element d is +v[d]
realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d];
+ for (int d = 0; d < size; ++d)
+ res.v[d] = +v[d];
return res;
}
-
-
-
- // inttestvec definitions
-
- template<typename T, int N>
- inline
- typename inttestvec<T,N>::realvec_t
- inttestvec<T,N>::as_float() const
- {
+ realvec_t operator-() const { // unary minus: returns a vector whose element d is -v[d]
realvec_t res;
- for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]);
+ for (int d = 0; d < size; ++d)
+ res.v[d] = -v[d];
return res;
}
-
- template<typename T, int N>
- inline
- typename inttestvec<T,N>::realvec_t
- inttestvec<T,N>::convert_float() const
- {
- return MF::vml_convert_float(*this);
- }
-
-
-
- // Wrappers
-
- // booltestvec wrappers
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> as_int(booltestvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline bool all(booltestvec<real_t, size> x) { return x.all(); }
-
- template<typename real_t, int size>
- inline bool any(booltestvec<real_t, size> x) { return x.any(); }
-
- template<typename real_t, int size>
- inline
- booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
- booltestvec<real_t, size> x,
- booltestvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
- inttestvec<real_t, size> x,
- inttestvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
- template<typename real_t, int size>
- inline
- realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
- realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return c.ifthen(x, y);
- }
-
-
-
- // inttestvec wrappers
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x)
- {
- return x.abs();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x)
- {
- return x.as_bool();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x)
- {
- return x.as_float();
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
- inttestvec<real_t, size> y,
- inttestvec<real_t, size> z)
- {
- return x.bitifthen(y, z);
- }
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x)
- {
- return x.clz();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x)
- {
- return x.convert_bool();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x)
- {
- return x.convert_float();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x)
- {
- return x.isignbit();
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
- typename inttestvec<real_t, size>::int_t n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
- inttestvec<real_t, size> n)
- {
- return x.lsr(n);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> max(inttestvec<real_t, size> x,
- inttestvec<real_t, size> y)
- {
- return x.max(y);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> min(inttestvec<real_t, size> x,
- inttestvec<real_t, size> y)
- {
- return x.min(y);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> popcount(inttestvec<real_t, size> x)
- {
- return x.popcount();
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
- typename inttestvec<real_t, size>::int_t n)
- {
- return x.rotate(n);
- }
-
- template<typename real_t, int size>
- inline
- inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
- inttestvec<real_t, size> n)
- {
- return x.rotate(n);
- }
-
-
-
- // realtestvec wrappers
-
- template<typename real_t, int size>
- inline
- realtestvec<real_t, size>
- loada(real_t const* p,
- realtestvec<real_t, size> x,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.loada(p, m);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size>
- loadu(real_t const* p,
- realtestvec<real_t, size> x,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, m);
- }
-
- template<typename real_t, int size>
- inline
- realtestvec<real_t, size>
- loadu(real_t const* p, size_t ioff,
- realtestvec<real_t, size> x,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.loadu(p, ioff, m);
- }
-
- template<typename real_t, int size>
- inline void storea(realtestvec<real_t, size> x, real_t* p)
- {
- return x.storea(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realtestvec<real_t, size> x, real_t* p)
- {
- return x.storeu(p);
- }
-
- template<typename real_t, int size>
- inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff)
- {
- return x.storeu(p, ioff);
- }
-
- template<typename real_t, int size>
- inline void storea(realtestvec<real_t, size> x, real_t* p,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.storea(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realtestvec<real_t, size> x, real_t* p,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, m);
- }
-
- template<typename real_t, int size>
- inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff,
- typename realtestvec<real_t, size>::mask_t const& m)
- {
- return x.storeu(p, ioff, m);
- }
-
-
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x)
- {
- return x.as_int();
- }
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x)
- {
- return x.convert_int();
- }
-
- template<typename real_t, int size>
- inline real_t maxval(realtestvec<real_t, size> x)
- {
- return x.maxval();
- }
-
- template<typename real_t, int size>
- inline real_t minval(realtestvec<real_t, size> x)
- {
- return x.minval();
- }
-
- template<typename real_t, int size>
- inline real_t prod(realtestvec<real_t, size> x)
- {
- return x.prod();
- }
-
- template<typename real_t, int size>
- inline real_t sum(realtestvec<real_t, size> x)
- {
- return x.sum();
- }
-
-
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x)
- {
- return x.acos();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x)
- {
- return x.acosh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x)
- {
- return x.asin();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x)
- {
- return x.asinh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x)
- {
- return x.atan();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.atan2(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x)
- {
- return x.atanh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x)
- {
- return x.cbrt();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x)
- {
- return x.ceil();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.copysign(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x)
- {
- return x.cos();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x)
- {
- return x.cosh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x)
- {
- return x.exp();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x)
- {
- return x.exp10();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x)
- {
- return x.exp2();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x)
- {
- return x.expm1();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x)
- {
- return x.fabs();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x)
- {
- return x.floor();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.fdim(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y,
- realtestvec<real_t, size> z)
- {
- return x.fma(y, z);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.fmax(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.fmin(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.fmod(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
- inttestvec<real_t, size>* r)
- {
- return x.frexp(r);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.hypot(y);
- }
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x)
- {
- return x.ilogb();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x)
- {
- return x.isfinite();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x)
- {
- return x.isinf();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x)
- {
- return x.isnan();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x)
- {
- return x.isnormal();
- }
-
- template<typename real_t, int size>
- inline
- realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
- typename inttestvec<real_t, size>::int_t n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline
- realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
- inttestvec<real_t, size> n)
- {
- return x.ldexp(n);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> log(realtestvec<real_t, size> x)
- {
- return x.log();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x)
- {
- return x.log10();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x)
- {
- return x.log1p();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x)
- {
- return x.log2();
- }
-
- template<typename real_t, int size>
- inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x)
- {
- return x.lrint();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y,
- realtestvec<real_t, size> z)
- {
- return x.mad(y, z);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.nextafter(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.pow(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x)
- {
- return x.rcp();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
- realtestvec<real_t, size> y)
- {
- return x.remainder(y);
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x)
- {
- return x.rint();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> round(realtestvec<real_t, size> x)
- {
- return x.round();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x)
- {
- return x.rsqrt();
- }
-
- template<typename real_t, int size>
- inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x)
- {
- return x.signbit();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x)
- {
- return x.sin();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x)
- {
- return x.sinh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x)
- {
- return x.sqrt();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x)
- {
- return x.tan();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x)
- {
- return x.tanh();
- }
-
- template<typename real_t, int size>
- inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x)
- {
- return x.trunc();
- }
-
-
-
-#ifndef VML_NO_IOSTREAM
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- booltestvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- inttestvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
- }
-
- template<typename real_t, int size>
- std::ostream& operator<<(std::ostream& os,
- realtestvec<real_t, size> const& x)
- {
- os << "[";
- for (int i=0; i<size; ++i) {
- if (i!=0) os << ",";
- os << x[i];
- }
- os << "]";
- return os;
+
+ realvec_t &operator+=(realvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] += x.v[d];
+ return *this;
+ }
+ realvec_t &operator-=(realvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] -= x.v[d];
+ return *this;
+ }
+ realvec_t &operator*=(realvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] *= x.v[d];
+ return *this;
+ }
+ realvec_t &operator/=(realvec_t const &x) {
+ for (int d = 0; d < size; ++d)
+ v[d] /= x.v[d];
+ return *this;
}
+
+ realvec_t operator+(realvec_t x) const {
+ realvec_t res = *this;
+ return res += x;
+ }
+ realvec_t operator-(realvec_t x) const {
+ realvec_t res = *this;
+ return res -= x;
+ }
+ realvec_t operator*(realvec_t x) const {
+ realvec_t res = *this;
+ return res *= x;
+ }
+ realvec_t operator/(realvec_t x) const {
+ realvec_t res = *this;
+ return res /= x;
+ }
+
+ real_t maxval() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = vml_std::fmax(res, v[d]);
+ return res;
+ }
+ real_t minval() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res = vml_std::fmin(res, v[d]);
+ return res;
+ }
+ real_t prod() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res *= v[d];
+ return res;
+ }
+ real_t sum() const {
+ real_t res = v[0];
+ for (int d = 1; d < size; ++d)
+ res += v[d];
+ return res;
+ }
+
+ boolvec_t operator==(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] == x.v[d];
+ return res;
+ }
+ boolvec_t operator!=(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] != x.v[d];
+ return res;
+ }
+ boolvec_t operator<(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] < x.v[d];
+ return res;
+ }
+ boolvec_t operator<=(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] <= x.v[d];
+ return res;
+ }
+ boolvec_t operator>(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] > x.v[d];
+ return res;
+ }
+ boolvec_t operator>=(realvec_t const &x) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] >= x.v[d];
+ return res;
+ }
+
+ realvec_t acos() const { return MF::vml_acos(*this); }
+ realvec_t acosh() const { return MF::vml_acosh(*this); }
+ realvec_t asin() const { return MF::vml_asin(*this); }
+ realvec_t asinh() const { return MF::vml_asinh(*this); }
+ realvec_t atan() const { return MF::vml_atan(*this); }
+ realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); }
+ realvec_t atanh() const { return MF::vml_atanh(*this); }
+ realvec_t cbrt() const { return MF::vml_cbrt(*this); }
+ realvec_t ceil() const { return MF::vml_ceil(*this); }
+ realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); }
+ realvec_t cos() const { return MF::vml_cos(*this); }
+ realvec_t cosh() const { return MF::vml_cosh(*this); }
+ realvec_t exp() const { return MF::vml_exp(*this); }
+ realvec_t exp10() const { return MF::vml_exp10(*this); }
+ realvec_t exp2() const { return MF::vml_exp2(*this); }
+ realvec_t expm1() const { return MF::vml_expm1(*this); }
+ realvec_t fabs() const { return MF::vml_fabs(*this); }
+ realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
+ realvec_t floor() const { return MF::vml_floor(*this); }
+ realvec_t fma(realvec_t y, realvec_t z) const {
+ return MF::vml_fma(*this, y, z);
+ }
+ realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); }
+ realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); }
+ realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
+ realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
+ realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec_t log() const { return MF::vml_log(*this); }
+ realvec_t log10() const { return MF::vml_log10(*this); }
+ realvec_t log1p() const { return MF::vml_log1p(*this); }
+ realvec_t log2() const { return MF::vml_log2(*this); }
+ intvec_t lrint() const { return MF::vml_lrint(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const {
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); }
+ realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); }
+ realvec_t rcp() const { return MF::vml_rcp(*this); }
+ realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); }
+ realvec_t rint() const { return MF::vml_rint(*this); }
+ realvec_t round() const { return MF::vml_round(*this); }
+ realvec_t rsqrt() const { return MF::vml_rsqrt(*this); }
+ boolvec_t signbit() const { return MF::vml_signbit(*this); }
+ realvec_t sin() const { return MF::vml_sin(*this); }
+ realvec_t sinh() const { return MF::vml_sinh(*this); }
+ realvec_t sqrt() const { return MF::vml_sqrt(*this); }
+ realvec_t tan() const { return MF::vml_tan(*this); }
+ realvec_t tanh() const { return MF::vml_tanh(*this); }
+ realvec_t trunc() const { return MF::vml_trunc(*this); }
+};
+
+// booltestvec definitions
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t booltestvec<T, N>::as_int() const {
+ return convert_int();
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::convert_int() const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::boolvec_t
+booltestvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const {
+ boolvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::intvec_t
+booltestvec<T, N>::ifthen(intvec_t x, intvec_t y) const {
+ intvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+template <typename T, int N>
+inline typename booltestvec<T, N>::realvec_t
+booltestvec<T, N>::ifthen(realvec_t x, realvec_t y) const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = v[d] ? x.v[d] : y.v[d];
+ return res;
+}
+
+// inttestvec definitions
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t inttestvec<T, N>::as_float() const {
+ realvec_t res;
+ for (int d = 0; d < size; ++d)
+ res.v[d] = FP::as_float(v[d]);
+ return res;
+}
+
+template <typename T, int N>
+inline typename inttestvec<T, N>::realvec_t
+inttestvec<T, N>::convert_float() const {
+ return MF::vml_convert_float(*this);
+}
+
+// Wrappers
+
+// booltestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(booltestvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline bool all(booltestvec<real_t, size> x) {
+ return x.all();
+}
+
+template <typename real_t, int size>
+inline bool any(booltestvec<real_t, size> x) {
+ return x.any();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+ booltestvec<real_t, size> x,
+ booltestvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+ inttestvec<real_t, size> x,
+ inttestvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c,
+ realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return c.ifthen(x, y);
+}
+
+// inttestvec wrappers
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x) {
+ return x.abs();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x) {
+ return x.as_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x) {
+ return x.as_float();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x,
+ inttestvec<real_t, size> y,
+ inttestvec<real_t, size> z) {
+ return x.bitifthen(y, z);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x) {
+ return x.clz();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x) {
+ return x.convert_bool();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x) {
+ return x.convert_float();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x) {
+ return x.isignbit();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+lsr(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lsr(inttestvec<real_t, size> x,
+ inttestvec<real_t, size> n) {
+ return x.lsr(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> max(inttestvec<real_t, size> x,
+ inttestvec<real_t, size> y) {
+ return x.max(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> min(inttestvec<real_t, size> x,
+ inttestvec<real_t, size> y) {
+ return x.min(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> popcount(inttestvec<real_t, size> x) {
+ return x.popcount();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size>
+rotate(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+ return x.rotate(n);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> rotate(inttestvec<real_t, size> x,
+ inttestvec<real_t, size> n) {
+ return x.rotate(n);
+}
+
+// realtestvec wrappers
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loada(real_t const *p, realtestvec<real_t, size> x,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.loada(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, realtestvec<real_t, size> x,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, m);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+loadu(real_t const *p, size_t ioff, realtestvec<real_t, size> x,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.loadu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p) {
+ return x.storea(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p) {
+ return x.storeu(p);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff) {
+ return x.storeu(p, ioff);
+}
+
+template <typename real_t, int size>
+inline void storea(realtestvec<real_t, size> x, real_t *p,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.storea(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, m);
+}
+
+template <typename real_t, int size>
+inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff,
+ typename realtestvec<real_t, size>::mask_t const &m) {
+ return x.storeu(p, ioff, m);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x) {
+ return x.as_int();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x) {
+ return x.convert_int();
+}
+
+template <typename real_t, int size>
+inline real_t maxval(realtestvec<real_t, size> x) {
+ return x.maxval();
+}
+
+template <typename real_t, int size>
+inline real_t minval(realtestvec<real_t, size> x) {
+ return x.minval();
+}
+
+template <typename real_t, int size>
+inline real_t prod(realtestvec<real_t, size> x) {
+ return x.prod();
+}
+
+template <typename real_t, int size>
+inline real_t sum(realtestvec<real_t, size> x) {
+ return x.sum();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x) {
+ return x.acos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x) {
+ return x.acosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x) {
+ return x.asin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x) {
+ return x.asinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x) {
+ return x.atan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.atan2(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x) {
+ return x.atanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x) {
+ return x.cbrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x) {
+ return x.ceil();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.copysign(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x) {
+ return x.cos();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x) {
+ return x.cosh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x) {
+ return x.exp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x) {
+ return x.exp10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x) {
+ return x.exp2();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x) {
+ return x.expm1();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x) {
+ return x.fabs();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x) {
+ return x.floor();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.fdim(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y,
+ realtestvec<real_t, size> z) {
+ return x.fma(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.fmax(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.fmin(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.fmod(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x,
+ inttestvec<real_t, size> *r) {
+ return x.frexp(r);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.hypot(y);
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x) {
+ return x.ilogb();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x) {
+ return x.isfinite();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x) {
+ return x.isinf();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x) {
+ return x.isnan();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x) {
+ return x.isnormal();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size>
+ldexp(realtestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x,
+ inttestvec<real_t, size> n) {
+ return x.ldexp(n);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log(realtestvec<real_t, size> x) {
+ return x.log();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x) {
+ return x.log10();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x) {
+ return x.log1p();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x) {
+ return x.log2();
+}
+
+template <typename real_t, int size>
+inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x) {
+ return x.lrint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y,
+ realtestvec<real_t, size> z) {
+ return x.mad(y, z);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.nextafter(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.pow(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x) {
+ return x.rcp();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x,
+ realtestvec<real_t, size> y) {
+ return x.remainder(y);
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x) {
+ return x.rint();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> round(realtestvec<real_t, size> x) {
+ return x.round();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x) {
+ return x.rsqrt();
+}
+
+template <typename real_t, int size>
+inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x) {
+ return x.signbit();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x) {
+ return x.sin();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x) {
+ return x.sinh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x) {
+ return x.sqrt();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x) {
+ return x.tan();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x) {
+ return x.tanh();
+}
+
+template <typename real_t, int size>
+inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x) {
+ return x.trunc();
+}
+
+#ifndef VML_NO_IOSTREAM
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, booltestvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, inttestvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
+
+template <typename real_t, int size>
+std::ostream &operator<<(std::ostream &os, realtestvec<real_t, size> const &x) {
+ os << "[";
+ for (int i = 0; i < size; ++i) {
+ if (i != 0)
+ os << ",";
+ os << x[i];
+ }
+ os << "]";
+ return os;
+}
#endif
-
+
} // namespace vecmathlib
-#endif // #ifndef VEC_TEST_H
+#endif // #ifndef VEC_TEST_H
diff --git a/vec_vsx_double2.h b/vec_vsx_double2.h
index 6725859..fa43a6f 100644
--- a/vec_vsx_double2.h
+++ b/vec_vsx_double2.h
@@ -13,679 +13,572 @@
#include <altivec.h>
#if defined __clang__
-# define __vector vector
-# define __pixel pixel
-# define __bool bool
+#define __vector vector
+#define __pixel pixel
+#define __bool bool
#elif defined __gcc__
-# undef vector
-# undef pixel
-# undef bool
+#undef vector
+#undef pixel
+#undef bool
#elif defined __xlC__
-# define __bool bool
+#define __bool bool
#else
-# error "Unknown compiler"
+#error "Unknown compiler"
#endif
-
-
namespace vecmathlib {
-
+
#define VECMATHLIB_HAVE_VEC_DOUBLE_2
- template<> struct boolvec<double,2>;
- template<> struct intvec<double,2>;
- template<> struct realvec<double,2>;
-
-
-
- template<>
- struct boolvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef bool scalar_t;
- typedef __vector __bool long long bvector_t;
- static int const alignment = sizeof(bvector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(bvector_t),
- "vector size is wrong");
-
- private:
- // true values are -1, false values are 0
- // truth values are interpreted bit-wise
- static uint_t from_bool(bool a) { return -int_t(a); }
- static bool to_bool(uint_t a) { return a; }
- public:
-
- typedef boolvec boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- bvector_t v;
-
- boolvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // boolvec(boolvec const& x): v(x.v) {}
- // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
- boolvec(bvector_t x): v(x) {}
- boolvec(bool a): v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
- boolvec(bool const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator bvector_t() const { return v; }
- bool operator[](int n) const
- {
- return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n));
- }
- boolvec& set_elt(int n, bool a)
- {
- return
- vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this;
- }
-
-
-
- intvec_t as_int() const; // defined after intvec
- intvec_t convert_int() const; // defined after intvec
-
-
-
- boolvec operator!() const { return vec_nor(v, v); }
-
- boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
- boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
- boolvec operator==(boolvec x) const { return !(*this!=x); }
- boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
-
- bool all() const { return vec_all_ne(v, BV(false)); }
- bool any() const { return vec_any_ne(v, BV(false)); }
-
-
-
- // ifthen(condition, then-value, else-value)
- boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
- intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
- realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
- };
-
-
-
- template<>
- struct intvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef int_t scalar_t;
- typedef __vector signed long long ivector_t;
- static int const alignment = sizeof(ivector_t);
-
- static_assert(size * sizeof(real_t) == sizeof(ivector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec intvec_t;
- typedef realvec<real_t, size> realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- ivector_t v;
-
- intvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // intvec(intvec const& x): v(x.v) {}
- // intvec& operator=(intvec const& x) { return v=x.v, *this; }
- intvec(ivector_t x): v(x) {}
- intvec(int_t a): v(vec_splats((long long)a)) {}
- intvec(int_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
- static intvec iota() { return (__vector signed long long){0, 1}; }
-
- operator ivector_t() const { return v; }
- int_t operator[](int n) const
- {
- return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n);
- }
- intvec_t& set_elt(int n, int_t a)
- {
- return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this;
- }
-
-
-
- // Vector casts do not change the bit battern
- boolvec_t as_bool() const { return (__vector __bool long long)v; }
- boolvec_t convert_bool() const { return *this != IV(I(0)); }
- realvec_t as_float() const; // defined after realvec
- realvec_t convert_float() const; // defined after realvec
-
-
-
- // Permutation control words
- private:
- // 0123 4567 -> 1436
- // exchange pairs
- static __vector unsigned char perm_int_swap()
- {
- return
- (__vector unsigned char)
- {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27};
- }
- // 0123 4567 -> 0426
- // broadcast high elements of pairs
- static __vector unsigned char perm_int_bchi()
- {
- return
- (__vector unsigned char)
- {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};
- }
- public:
-
-
-
- intvec operator+() const { return *this; }
- intvec operator-() const { return vec_neg(v); }
-
- intvec operator+(intvec x) const { return vec_add(v, x.v); }
- intvec operator-(intvec x) const { return vec_sub(v, x.v); }
- intvec operator*(intvec x) const { return vec_mul(v, x.v); }
- intvec operator/(intvec x) const { return vec_div(v, x.v); }
- intvec operator%(intvec x) const { return *this - *this / x * x; }
-
- intvec& operator+=(intvec const& x) { return *this=*this+x; }
- intvec& operator-=(intvec const& x) { return *this=*this-x; }
- intvec& operator*=(intvec const& x) { return *this=*this*x; }
- intvec& operator/=(intvec const& x) { return *this=*this/x; }
- intvec& operator%=(intvec const& x) { return *this=*this%x; }
-
-
-
- intvec operator~() const
- {
- return (__vector signed long long)vec_nor((__vector signed int)v, (__vector signed int)v);
- }
-
- intvec operator&(intvec x) const
- {
- return (__vector signed long long)vec_and((__vector signed int)v, (__vector signed int)x.v);
- }
- intvec operator|(intvec x) const
- {
- return (__vector signed long long)vec_or ((__vector signed int)v, (__vector signed int)x.v);
- }
- intvec operator^(intvec x) const
- {
- return (__vector signed long long)vec_xor((__vector signed int)v, (__vector signed int)x.v);
- }
-
- intvec& operator&=(intvec const& x) { return *this=*this&x; }
- intvec& operator|=(intvec const& x) { return *this=*this|x; }
- intvec& operator^=(intvec const& x) { return *this=*this^x; }
-
- intvec_t bitifthen(intvec_t x, intvec_t y) const;
-
-
-
- intvec lsr(int_t n) const { return lsr(IV(n)); }
- intvec_t rotate(int_t n) const;
- intvec operator>>(int_t n) const { return *this >> IV(n); }
- intvec operator<<(int_t n) const { return *this << IV(n); }
- intvec& operator>>=(int_t n) { return *this=*this>>n; }
- intvec& operator<<=(int_t n) { return *this=*this<<n; }
-
- intvec lsr(intvec n) const
- {
- // return vec_sr(v, (__vector unsigned long long)n.v);
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, U((*this)[i]) >> U(n[i]));
- }
- return r;
- }
- intvec_t rotate(intvec_t n) const;
- intvec operator>>(intvec n) const
- {
- // return vec_sra(v, (__vector unsigned long long)n.v);
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] >> n[i]);
- }
- return r;
- }
- intvec operator<<(intvec n) const
- {
- // return vec_sl(v, (__vector unsigned long long)n.v);
- intvec r;
- for (int i=0; i<size; ++i) {
- r.set_elt(i, (*this)[i] << n[i]);
- }
- return r;
- }
- intvec& operator>>=(intvec n) { return *this=*this>>n; }
- intvec& operator<<=(intvec n) { return *this=*this<<n; }
-
- intvec_t clz() const;
- intvec_t popcount() const;
-
-
-
- boolvec_t operator==(intvec const& x) const
- {
- // return vec_cmpeq(v, x.v);
- __vector signed int a = (__vector signed int)v;
- __vector signed int b = (__vector signed int)x.v;
- __vector __bool int c = vec_cmpeq(a, b);
- __vector __bool int cx = vec_perm(c, c, perm_int_swap());
- __vector __bool int r = vec_and(c, cx);
- return (__vector __bool long long)r;
- }
- boolvec_t operator!=(intvec const& x) const { return !(*this == x); }
- boolvec_t operator<(intvec const& x) const
- {
- __vector signed int a = (__vector signed int)v;
- __vector signed int b = (__vector signed int)x.v;
- __vector __bool int lt = vec_cmplt(a, b);
- __vector __bool int eq = vec_cmpeq(a, b);
- __vector unsigned int ua = (__vector unsigned int)v;
- __vector unsigned int ub = (__vector unsigned int)x.v;
- __vector __bool int ult = vec_cmplt(ua, ub);
- __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
- __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
- r = vec_perm(r, r, perm_int_bchi());
- return (__vector __bool long long)r;
- }
- boolvec_t operator<=(intvec const& x) const
- {
- return ! (*this > x);
- }
- boolvec_t operator>(intvec const& x) const
- {
- return x < *this;
- }
- boolvec_t operator>=(intvec const& x) const
- {
- return ! (*this < x);
- }
-
- intvec_t abs() const;
- boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); }
- intvec_t max(intvec_t x) const;
- intvec_t min(intvec_t x) const;
- };
-
-
-
- template<>
- struct realvec<double,2>: floatprops<double>
- {
- static int const size = 2;
- typedef real_t scalar_t;
- typedef __vector double vector_t;
- static int const alignment = sizeof(vector_t);
-
- static char const* name() { return "<VSX:2*double>"; }
- void barrier() { __asm__("": "+v"(v)); }
-
- static_assert(size * sizeof(real_t) == sizeof(vector_t),
- "vector size is wrong");
-
- typedef boolvec<real_t, size> boolvec_t;
- typedef intvec<real_t, size> intvec_t;
- typedef realvec realvec_t;
-
- // Short names for type casts
- typedef real_t R;
- typedef int_t I;
- typedef uint_t U;
- typedef realvec_t RV;
- typedef intvec_t IV;
- typedef boolvec_t BV;
- typedef floatprops<real_t> FP;
- typedef mathfuncs<realvec_t> MF;
-
-
-
- vector_t v;
-
- realvec() {}
- // Can't have a non-trivial copy constructor; if so, objects won't
- // be passed in registers
- // realvec(realvec const& x): v(x.v) {}
- // realvec& operator=(realvec const& x) { return v=x.v, *this; }
- realvec(vector_t x): v(x) {}
- realvec(real_t a): v(vec_splats(a)) {}
- realvec(real_t const* as)
- {
- for (int d=0; d<size; ++d) set_elt(d, as[d]);
- }
-
- operator vector_t() const { return v; }
- real_t operator[](int n) const
- {
- return vecmathlib::get_elt<RV,vector_t,real_t>(v, n);
- }
- realvec_t& set_elt(int n, real_t a)
- {
- return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this;
- }
-
-
-
- typedef vecmathlib::mask_t<realvec_t> mask_t;
-
- static realvec_t loada(real_t const* p)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- return vec_xld2(0, (real_t*)p);
- }
- static realvec_t loadu(real_t const* p)
- {
- // TODO: Can this handle unaligned access?
- return vec_xld2(0, (real_t*)p);
- }
- static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff)
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff);
- return loadu(p+ioff);
- }
- realvec_t loada(real_t const* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(all(m.m), true)) {
- return loada(p);
- } else {
- return m.m.ifthen(loada(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- return loadu(p);
- } else {
- return m.m.ifthen(loadu(p), *this);
- }
- }
- realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return loada(p+ioff, m);
- return loadu(p+ioff, m);
- }
-
- void storea(real_t* p) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- vec_xstd2(v, 0, p);
- }
- void storeu(real_t* p) const
- {
- // Vector stores would require vector loads, which would need to
- // be atomic
- // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas
- p[0] = (*this)[0];
- p[1] = (*this)[1];
- }
- void storeu(real_t* p, std::ptrdiff_t ioff) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff);
- storeu(p+ioff);
- }
- void storea(real_t* p, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (__builtin_expect(m.all_m, true)) {
- storea(p);
- } else {
- // Use vec_ste?
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- }
- }
- void storeu(real_t* p, mask_t const& m) const
- {
- if (__builtin_expect(m.all_m, true)) {
- storeu(p);
- } else {
- // Use vec_ste?
- if (m.m[0]) p[0] = (*this)[0];
- if (m.m[1]) p[1] = (*this)[1];
- }
- }
- void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const
- {
- VML_ASSERT(intptr_t(p) % alignment == 0);
- if (ioff % realvec::size == 0) return storea(p+ioff, m);
- storeu(p+ioff, m);
- }
-
-
-
- intvec_t as_int() const { return (__vector signed long long) v; }
- intvec_t convert_int() const { return MF::vml_convert_int(*this); }
-
-
-
- realvec operator+() const { return *this; }
- realvec operator-() const { return RV(0.0) - *this; }
-
- realvec operator+(realvec x) const { return vec_add(v, x.v); }
- realvec operator-(realvec x) const { return vec_sub(v, x.v); }
- realvec operator*(realvec x) const { return vec_mul(v, x.v); }
- realvec operator/(realvec x) const { return vec_div(v, x.v); }
-
- realvec& operator+=(realvec const& x) { return *this=*this+x; }
- realvec& operator-=(realvec const& x) { return *this=*this-x; }
- realvec& operator*=(realvec const& x) { return *this=*this*x; }
- realvec& operator/=(realvec const& x) { return *this=*this/x; }
-
- real_t maxval() const
- {
- return vml_std::fmax((*this)[0], (*this)[1]);
+template <> struct boolvec<double, 2>;
+template <> struct intvec<double, 2>;
+template <> struct realvec<double, 2>;
+
+template <> struct boolvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef bool scalar_t;
+ typedef __vector __bool long long bvector_t;
+ static int const alignment = sizeof(bvector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+private:
+ // true values are -1, false values are 0
+ // truth values are interpreted bit-wise
+ static uint_t from_bool(bool a) { return -int_t(a); }
+ static bool to_bool(uint_t a) { return a; }
+
+public:
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x) : v(x) {}
+ boolvec(bool a)
+ : v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {}
+ boolvec(bool const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const {
+ return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n));
+ }
+ boolvec &set_elt(int n, bool a) {
+ return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)),
+ *this;
+ }
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+ boolvec operator!() const { return vec_nor(v, v); }
+
+ boolvec operator&&(boolvec x) const { return vec_and(v, x.v); }
+ boolvec operator||(boolvec x) const { return vec_or(v, x.v); }
+ boolvec operator==(boolvec x) const { return !(*this != x); }
+ boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); }
+
+ bool all() const { return vec_all_ne(v, BV(false)); }
+ bool any() const { return vec_any_ne(v, BV(false)); }
+
+ // ifthen(condition, then-value, else-value)
+ boolvec_t ifthen(boolvec_t x, boolvec_t y) const;
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+};
+
+template <> struct intvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef int_t scalar_t;
+ typedef __vector signed long long ivector_t;
+ static int const alignment = sizeof(ivector_t);
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x) : v(x) {}
+ intvec(int_t a) : v(vec_splats((long long)a)) {}
+ intvec(int_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+ static intvec iota() { return (__vector signed long long){0, 1}; }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const {
+ return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n);
+ }
+ intvec_t &set_elt(int n, int_t a) {
+ return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this;
+ }
+
+ // Vector casts do not change the bit pattern
+ boolvec_t as_bool() const { return (__vector __bool long long)v; }
+ boolvec_t convert_bool() const { return *this != IV(I(0)); }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+ // Permutation control words
+private:
+ // 0123 4567 -> 1436
+ // exchange pairs
+ static __vector unsigned char perm_int_swap() {
+ return (__vector unsigned char){4, 5, 6, 7, 16, 17, 18, 19,
+ 12, 13, 14, 15, 24, 25, 26, 27};
+ }
+ // 0123 4567 -> 0426
+ // broadcast high elements of pairs
+ static __vector unsigned char perm_int_bchi() {
+ return (__vector unsigned char){0, 1, 2, 3, 16, 17, 18, 19,
+ 8, 9, 10, 11, 24, 25, 26, 27};
+ }
+
+public:
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return vec_neg(v); }
+
+ intvec operator+(intvec x) const { return vec_add(v, x.v); }
+ intvec operator-(intvec x) const { return vec_sub(v, x.v); }
+ intvec operator*(intvec x) const { return vec_mul(v, x.v); }
+ intvec operator/(intvec x) const { return vec_div(v, x.v); }
+ intvec operator%(intvec x) const { return *this - *this / x * x; }
+
+ intvec &operator+=(intvec const &x) { return *this = *this + x; }
+ intvec &operator-=(intvec const &x) { return *this = *this - x; }
+ intvec &operator*=(intvec const &x) { return *this = *this * x; }
+ intvec &operator/=(intvec const &x) { return *this = *this / x; }
+ intvec &operator%=(intvec const &x) { return *this = *this % x; }
+
+ intvec operator~() const {
+ return (__vector signed long long)vec_nor((__vector signed int)v,
+ (__vector signed int)v);
+ }
+
+ intvec operator&(intvec x) const {
+ return (__vector signed long long)vec_and((__vector signed int)v,
+ (__vector signed int)x.v);
+ }
+ intvec operator|(intvec x) const {
+ return (__vector signed long long)vec_or((__vector signed int)v,
+ (__vector signed int)x.v);
+ }
+ intvec operator^(intvec x) const {
+ return (__vector signed long long)vec_xor((__vector signed int)v,
+ (__vector signed int)x.v);
+ }
+
+ intvec &operator&=(intvec const &x) { return *this = *this & x; }
+ intvec &operator|=(intvec const &x) { return *this = *this | x; }
+ intvec &operator^=(intvec const &x) { return *this = *this ^ x; }
+
+ intvec_t bitifthen(intvec_t x, intvec_t y) const;
+
+ intvec lsr(int_t n) const { return lsr(IV(n)); }
+ intvec_t rotate(int_t n) const;
+ intvec operator>>(int_t n) const { return *this >> IV(n); }
+ intvec operator<<(int_t n) const { return *this << IV(n); }
+ intvec &operator>>=(int_t n) { return *this = *this >> n; }
+ intvec &operator<<=(int_t n) { return *this = *this << n; }
+
+ intvec lsr(intvec n) const {
+ // return vec_sr(v, (__vector unsigned long long)n.v);
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, U((*this)[i]) >> U(n[i]));
}
- real_t minval() const
- {
- return vml_std::fmin((*this)[0], (*this)[1]);
+ return r;
+ }
+ intvec_t rotate(intvec_t n) const;
+ intvec operator>>(intvec n) const {
+ // return vec_sra(v, (__vector unsigned long long)n.v);
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] >> n[i]);
+ }
+ return r;
+ }
+ intvec operator<<(intvec n) const {
+ // return vec_sl(v, (__vector unsigned long long)n.v);
+ intvec r;
+ for (int i = 0; i < size; ++i) {
+ r.set_elt(i, (*this)[i] << n[i]);
}
- real_t prod() const
- {
- return (*this)[0] * (*this)[1];
+ return r;
+ }
+ intvec &operator>>=(intvec n) { return *this = *this >> n; }
+ intvec &operator<<=(intvec n) { return *this = *this << n; }
+
+ intvec_t clz() const;
+ intvec_t popcount() const;
+
+ boolvec_t operator==(intvec const &x) const {
+ // return vec_cmpeq(v, x.v);
+ __vector signed int a = (__vector signed int)v;
+ __vector signed int b = (__vector signed int)x.v;
+ __vector __bool int c = vec_cmpeq(a, b);
+ __vector __bool int cx = vec_perm(c, c, perm_int_swap());
+ __vector __bool int r = vec_and(c, cx);
+ return (__vector __bool long long)r;
+ }
+ boolvec_t operator!=(intvec const &x) const { return !(*this == x); }
+ boolvec_t operator<(intvec const &x) const {
+ __vector signed int a = (__vector signed int)v;
+ __vector signed int b = (__vector signed int)x.v;
+ __vector __bool int lt = vec_cmplt(a, b);
+ __vector __bool int eq = vec_cmpeq(a, b);
+ __vector unsigned int ua = (__vector unsigned int)v;
+ __vector unsigned int ub = (__vector unsigned int)x.v;
+ __vector __bool int ult = vec_cmplt(ua, ub);
+ __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap());
+ __vector __bool int r = vec_or(lt, vec_and(eq, ultx));
+ r = vec_perm(r, r, perm_int_bchi());
+ return (__vector __bool long long)r;
+ }
+ boolvec_t operator<=(intvec const &x) const { return !(*this > x); }
+ boolvec_t operator>(intvec const &x) const { return x < *this; }
+ boolvec_t operator>=(intvec const &x) const { return !(*this < x); }
+
+ intvec_t abs() const;
+ boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); }
+ intvec_t max(intvec_t x) const;
+ intvec_t min(intvec_t x) const;
+};
+
+template <> struct realvec<double, 2> : floatprops<double> {
+ static int const size = 2;
+ typedef real_t scalar_t;
+ typedef __vector double vector_t;
+ static int const alignment = sizeof(vector_t);
+
+ static char const *name() { return "<VSX:2*double>"; }
+ void barrier() { __asm__("" : "+v"(v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x) : v(x) {}
+ realvec(real_t a) : v(vec_splats(a)) {}
+ realvec(real_t const *as) {
+ for (int d = 0; d < size; ++d)
+ set_elt(d, as[d]);
+ }
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const {
+ return vecmathlib::get_elt<RV, vector_t, real_t>(v, n);
+ }
+ realvec_t &set_elt(int n, real_t a) {
+ return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this;
+ }
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const *p) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ return vec_xld2(0, (real_t *)p);
+ }
+ static realvec_t loadu(real_t const *p) {
+ // TODO: Can this handle unaligned access?
+ return vec_xld2(0, (real_t *)p);
+ }
+ static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff);
+ return loadu(p + ioff);
+ }
+ realvec_t loada(real_t const *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
}
- real_t sum() const
- {
- return (*this)[0] + (*this)[1];
+ }
+ realvec_t loadu(real_t const *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
}
-
-
-
- boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); }
- boolvec_t operator!=(realvec const& x) const { return ! (*this == x); }
- boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); }
- boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); }
- boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); }
- boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); }
-
-
-
- realvec acos() const { return MF::vml_acos(*this); }
- realvec acosh() const { return MF::vml_acosh(*this); }
- realvec asin() const { return MF::vml_asin(*this); }
- realvec asinh() const { return MF::vml_asinh(*this); }
- realvec atan() const { return MF::vml_atan(*this); }
- realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
- realvec atanh() const { return MF::vml_atanh(*this); }
- realvec cbrt() const { return MF::vml_cbrt(*this); }
- realvec ceil() const { return vec_ceil(v); }
- realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
- realvec cos() const { return MF::vml_cos(*this); }
- realvec cosh() const { return MF::vml_cosh(*this); }
- realvec exp() const { return MF::vml_exp(*this); }
- realvec exp10() const { return MF::vml_exp10(*this); }
- realvec exp2() const { return MF::vml_exp2(*this); }
- realvec expm1() const { return MF::vml_expm1(*this); }
- realvec fabs() const { return vec_abs(v); }
- realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
- realvec floor() const { return vec_floor(v); }
- realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); }
- realvec fmax(realvec y) const { return vec_max(v, y.v); }
- realvec fmin(realvec y) const { return vec_min(v, y.v); }
- realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
- realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); }
- realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
- intvec_t ilogb() const { return MF::vml_ilogb(*this); }
- boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
- boolvec_t isinf() const { return MF::vml_isinf(*this); }
- boolvec_t isnan() const { return MF::vml_isnan(*this); }
- boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
- realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
- realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
- realvec log() const { return MF::vml_log(*this); }
- realvec log10() const { return MF::vml_log10(*this); }
- realvec log1p() const { return MF::vml_log1p(*this); }
- realvec log2() const { return MF::vml_log2(*this); }
- realvec_t mad(realvec_t y, realvec_t z) const
- {
- return MF::vml_mad(*this, y, z);
+ }
+ realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return loada(p + ioff, m);
+ return loadu(p + ioff, m);
+ }
+
+ void storea(real_t *p) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ vec_xstd2(v, 0, p);
+ }
+ void storeu(real_t *p) const {
+ // Vector stores would require vector loads, which would need to
+ // be atomic
+ // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html>
+ // for good ideas
+ p[0] = (*this)[0];
+ p[1] = (*this)[1];
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff);
+ storeu(p + ioff);
+ }
+ void storea(real_t *p, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // Use vec_ste?
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
}
- realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
- realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
- realvec rcp() const
- {
- realvec x = *this;
- realvec r = vec_re(v); // this is only an approximation
- // TODO: use fma
- // Note: don't rewrite this expression, this may introduce
- // cancellation errors
- r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp)
- r += r * (RV(1.0) - x*r);
- return r;
+ }
+ void storeu(real_t *p, mask_t const &m) const {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // Use vec_ste?
+ if (m.m[0])
+ p[0] = (*this)[0];
+ if (m.m[1])
+ p[1] = (*this)[1];
}
- realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
- realvec rint() const { return vec_round(v); /* sic! */}
- realvec round() const { return MF::vml_round(*this); }
- realvec rsqrt() const { return RV(1.0) / sqrt(); }
- boolvec_t signbit() const { return MF::vml_signbit(*this); }
- realvec sin() const { return MF::vml_sin(*this); }
- realvec sinh() const { return MF::vml_sinh(*this); }
- realvec sqrt() const { return vec_sqrt(v); }
- realvec tan() const { return MF::vml_tan(*this); }
- realvec tanh() const { return MF::vml_tanh(*this); }
- realvec trunc() const { return vec_trunc(v); }
- };
-
-
-
- // boolvec definitions
-
- inline intvec<double,2> boolvec<double,2>::as_int() const
- {
- return (__vector signed long long) v;
- }
-
- inline intvec<double,2> boolvec<double,2>::convert_int() const
- {
- return -(__vector signed long long)v;
- }
-
- inline
- boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
- inline
- intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
- inline
- realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const
- {
- return vec_sel(y.v, x.v, v);
- }
-
-
-
- // intvec definitions
-
- inline intvec<double,2> intvec<double,2>::abs() const
- {
- return MF::vml_abs(*this);
- }
-
- inline realvec<double,2> intvec<double,2>::as_float() const
- {
- return (__vector double)v;
- }
-
- inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x,
- intvec_t y) const
- {
- return MF::vml_bitifthen(*this, x, y);
- }
-
- inline intvec<double,2> intvec<double,2>::clz() const
- {
- return MF::vml_clz(*this);
- }
-
- inline realvec<double,2> intvec<double,2>::convert_float() const
- {
- // return vec_ctd(v, 0);
- return MF::vml_convert_float(*this);
- }
-
- inline intvec<double,2> intvec<double,2>::max(intvec_t x) const
- {
- return MF::vml_max(*this, x);
- }
-
- inline intvec<double,2> intvec<double,2>::min(intvec_t x) const
- {
- return MF::vml_min(*this, x);
- }
-
- inline intvec<double,2> intvec<double,2>::popcount() const
- {
- return MF::vml_popcount(*this);
- }
-
- inline intvec<double,2> intvec<double,2>::rotate(int_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
- inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const
- {
- return MF::vml_rotate(*this, n);
- }
-
+ }
+ void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const {
+ VML_ASSERT(intptr_t(p) % alignment == 0);
+ if (ioff % realvec::size == 0)
+ return storea(p + ioff, m);
+ storeu(p + ioff, m);
+ }
+
+ intvec_t as_int() const { return (__vector signed long long)v; }
+ intvec_t convert_int() const { return MF::vml_convert_int(*this); }
+
+ realvec operator+() const { return *this; } // unary plus: returns a copy
+ realvec operator-() const { return RV(0.0) - *this; } // NOTE(review): negation is computed as 0.0 - x, so under default rounding a +0.0 lane gives +0.0, not -0.0 — confirm this is acceptable
+
+ realvec operator+(realvec x) const { return vec_add(v, x.v); } // binary operators call the vec_add/vec_sub/vec_mul/vec_div intrinsics on the raw vectors
+ realvec operator-(realvec x) const { return vec_sub(v, x.v); }
+ realvec operator*(realvec x) const { return vec_mul(v, x.v); }
+ realvec operator/(realvec x) const { return vec_div(v, x.v); }
+
+ realvec &operator+=(realvec const &x) { return *this = *this + x; } // compound assignments are built from the binary operators above
+ realvec &operator-=(realvec const &x) { return *this = *this - x; }
+ realvec &operator*=(realvec const &x) { return *this = *this * x; }
+ realvec &operator/=(realvec const &x) { return *this = *this / x; }
+
+ real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); } // horizontal reductions: lanes 0 and 1 are extracted and combined as scalars
+ real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); }
+ real_t prod() const { return (*this)[0] * (*this)[1]; } // lane 0 times lane 1
+ real_t sum() const { return (*this)[0] + (*this)[1]; } // lane 0 plus lane 1
+
+ boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); } // comparisons return a boolvec_t built from the vec_cmp* intrinsic results
+ boolvec_t operator!=(realvec const &x) const { return !(*this == x); } // expressed as the negation of operator== rather than a separate intrinsic
+ boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); }
+ boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); }
+ boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); }
+ boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); }
+
+ realvec acos() const { return MF::vml_acos(*this); } // unless noted, the math functions below forward to the matching MF::vml_* routine
+ realvec acosh() const { return MF::vml_acosh(*this); }
+ realvec asin() const { return MF::vml_asin(*this); }
+ realvec asinh() const { return MF::vml_asinh(*this); }
+ realvec atan() const { return MF::vml_atan(*this); }
+ realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); }
+ realvec atanh() const { return MF::vml_atanh(*this); }
+ realvec cbrt() const { return MF::vml_cbrt(*this); }
+ realvec ceil() const { return vec_ceil(v); } // intrinsic instead of an MF routine
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec cos() const { return MF::vml_cos(*this); }
+ realvec cosh() const { return MF::vml_cosh(*this); }
+ realvec exp() const { return MF::vml_exp(*this); }
+ realvec exp10() const { return MF::vml_exp10(*this); }
+ realvec exp2() const { return MF::vml_exp2(*this); }
+ realvec expm1() const { return MF::vml_expm1(*this); }
+ realvec fabs() const { return vec_abs(v); } // intrinsic instead of an MF routine
+ realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); }
+ realvec floor() const { return vec_floor(v); } // intrinsic instead of an MF routine
+ realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } // vec_madd multiply-add intrinsic; presumably computes v * y + z
+ realvec fmax(realvec y) const { return vec_max(v, y.v); } // intrinsic instead of an MF routine
+ realvec fmin(realvec y) const { return vec_min(v, y.v); } // intrinsic instead of an MF routine
+ realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); }
+ realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } // r is passed through to MF::vml_frexp as the output pointer
+ realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } // overload taking one scalar exponent
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } // overload taking an integer vector of exponents
+ realvec log() const { return MF::vml_log(*this); }
+ realvec log10() const { return MF::vml_log10(*this); }
+ realvec log1p() const { return MF::vml_log1p(*this); }
+ realvec log2() const { return MF::vml_log2(*this); }
+ realvec_t mad(realvec_t y, realvec_t z) const { // unlike fma above, mad goes through MF::vml_mad rather than vec_madd
+ return MF::vml_mad(*this, y, z);
+ }
+ realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); }
+ realvec pow(realvec y) const { return MF::vml_pow(*this, y); }
+ realvec rcp() const { // reciprocal: vec_re estimate refined by two Newton steps
+ realvec x = *this;
+ realvec r = vec_re(v); // this is only an approximation
+ // TODO: use fma
+ // Note: do not rewrite the update expression below; an algebraically
+ // equivalent form may introduce cancellation errors
+ r += r * (RV(1.0) - x * r); // two Newton iterations (see vml_rcp)
+ r += r * (RV(1.0) - x * r); // second iteration, same update as the first
+ return r;
+ }
+ realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); }
+ realvec rint() const { return vec_round(v); /* sic! */ } // NOTE(review): rint is deliberately ("sic") mapped to vec_round, while round() below uses MF::vml_round — confirm the tie-breaking of vec_round is what rint should have
+ realvec round() const { return MF::vml_round(*this); }
+ realvec rsqrt() const { return RV(1.0) / sqrt(); } // computed as a full division by sqrt(); no reciprocal-square-root estimate is used
+ boolvec_t signbit() const { return MF::vml_signbit(*this); }
+ realvec sin() const { return MF::vml_sin(*this); }
+ realvec sinh() const { return MF::vml_sinh(*this); }
+ realvec sqrt() const { return vec_sqrt(v); } // intrinsic instead of an MF routine
+ realvec tan() const { return MF::vml_tan(*this); }
+ realvec tanh() const { return MF::vml_tanh(*this); }
+ realvec trunc() const { return vec_trunc(v); } // intrinsic instead of an MF routine
+};
+
+// boolvec definitions
+
+inline intvec<double, 2> boolvec<double, 2>::as_int() const { // exposes the mask vector v as an intvec
+ return (__vector signed long long)v; // vector-type cast only; presumably the lane bits are kept as they are
+}
+
+inline intvec<double, 2> boolvec<double, 2>::convert_int() const { // mask converted to integer values
+ return -(__vector signed long long)v; // negates the cast mask; presumably turns an all-ones (-1) true lane into 1 — TODO confirm the mask encoding
+}
+
+inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x, // select between two boolean vectors, using this mask as the condition
+ boolvec_t y) const {
+ return vec_sel(y.v, x.v, v); // vec_sel(y, x, mask): takes x where the mask bits are set, y elsewhere
+}
+
+inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x, // select between two integer vectors, using this mask as the condition
+ intvec_t y) const {
+ return vec_sel(y.v, x.v, v); // vec_sel(y, x, mask): takes x where the mask bits are set, y elsewhere
+}
+
+inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x, // select between two real vectors, using this mask as the condition
+ realvec_t y) const {
+ return vec_sel(y.v, x.v, v); // vec_sel(y, x, mask): takes x where the mask bits are set, y elsewhere
+}
+
+// intvec definitions
+
+inline intvec<double, 2> intvec<double, 2>::abs() const { // defined out of class; forwards to MF
+ return MF::vml_abs(*this); // no intrinsic is used here, the MF routine does the work
+}
+
+inline realvec<double, 2> intvec<double, 2>::as_float() const { // counterpart of realvec::as_int
+ return (__vector double)v; // vector-type cast only; presumably reinterprets the lane bits instead of converting values (convert_float does the conversion)
+}
+
+inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x, // this vector is passed as the first (selector) argument
+ intvec_t y) const {
+ return MF::vml_bitifthen(*this, x, y); // forwards to the MF routine
+}
+
+inline intvec<double, 2> intvec<double, 2>::clz() const { // forwards to MF
+ return MF::vml_clz(*this); // no intrinsic is used here
+}
+
+inline realvec<double, 2> intvec<double, 2>::convert_float() const { // integer-to-real value conversion
+ // Disabled intrinsic alternative, kept for reference: return vec_ctd(v, 0);
+ return MF::vml_convert_float(*this); // the MF routine is used in place of the disabled vec_ctd call
+}
+
+inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const { // forwards to MF
+ return MF::vml_max(*this, x); // no intrinsic is used here
+}
+
+inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const { // forwards to MF
+ return MF::vml_min(*this, x); // no intrinsic is used here
+}
+
+inline intvec<double, 2> intvec<double, 2>::popcount() const { // forwards to MF
+ return MF::vml_popcount(*this); // no intrinsic is used here
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const { // overload taking one scalar count n
+ return MF::vml_rotate(*this, n); // forwards to the MF routine
+}
+
+inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const { // overload taking an integer vector of counts n
+ return MF::vml_rotate(*this, n); // forwards to the MF routine
+}
+
} // namespace vecmathlib
-#endif // #ifndef VEC_VSX_DOUBLE2_H
+#endif // #ifndef VEC_VSX_DOUBLE2_H
diff --git a/vecmathlib.h b/vecmathlib.h
index 9accd24..0d72add 100644
--- a/vecmathlib.h
+++ b/vecmathlib.h
@@ -4,16 +4,14 @@
#define VECMATHLIB_H
#if defined VML_DEBUG || defined VML_NODEBUG
-# if defined VML_DEBUG && defined VML_NODEBUG
-# error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
-# endif
+#if defined VML_DEBUG && defined VML_NODEBUG // both requested at once: contradictory, reject
+#error "Only one of VML_DEBUG or VML_NODEBUG may be defined"
+#endif // VML_DEBUG && VML_NODEBUG
#else
// default
-# define VML_DEBUG
+#define VML_DEBUG // neither macro was given: debug mode (VML_ASSERT mapped to assert) is the default
#endif
-
-
// FP settings
// Possible effects of not having VML_HAVE_FP_CONTRACT:
@@ -23,7 +21,7 @@
// - can evaluate functions with reduced precision (80% of significant digits)
// default settings
-#undef VML_HAVE_DENORMALS // TODO
+#undef VML_HAVE_DENORMALS // TODO
#define VML_HAVE_FP_CONTRACT
#define VML_HAVE_INF
#define VML_HAVE_NAN
@@ -31,63 +29,59 @@
// optimized settings
#ifdef __FAST_MATH__
-# undef VML_HAVE_DENORMALS
-# undef VML_HAVE_FP_CONTRACT
-# undef VML_HAVE_INF
-# undef VML_HAVE_NAN
+#undef VML_HAVE_DENORMALS
+#undef VML_HAVE_FP_CONTRACT
+#undef VML_HAVE_INF
+#undef VML_HAVE_NAN
#endif
#ifdef VML_DEBUG
-# define VML_CONFIG_DEBUG " debug"
+#define VML_CONFIG_DEBUG " debug"
#else
-# define VML_CONFIG_DEBUG " no-debug"
+#define VML_CONFIG_DEBUG " no-debug"
#endif
#ifdef VML_DENORMALS
-# define VML_CONFIG_DENORMALS " denormals"
+#define VML_CONFIG_DENORMALS " denormals"
#else
-# define VML_CONFIG_DENORMALS " no-denormals"
+#define VML_CONFIG_DENORMALS " no-denormals"
#endif
#ifdef VML_FP_CONTRACT
-# define VML_CONFIG_FP_CONTRACT " fp-contract"
+#define VML_CONFIG_FP_CONTRACT " fp-contract"
#else
-# define VML_CONFIG_FP_CONTRACT " no-fp-contract"
+#define VML_CONFIG_FP_CONTRACT " no-fp-contract"
#endif
#ifdef VML_INF
-# define VML_CONFIG_INF " inf"
+#define VML_CONFIG_INF " inf"
#else
-# define VML_CONFIG_INF " no-inf"
+#define VML_CONFIG_INF " no-inf"
#endif
#ifdef VML_NAN
-# define VML_CONFIG_NAN " nan"
+#define VML_CONFIG_NAN " nan"
#else
-# define VML_CONFIG_NAN " no-nan"
+#define VML_CONFIG_NAN " no-nan"
#endif
// TODO: introduce mad, as fast version of fma (check FP_FAST_FMA)
// TODO: introduce ieee_isnan and friends
// TODO: switch between isnan and ieee_isnan at an outside level
-
-
// This workaround is needed for older libstdc++ versions such as the
// one in Debian 6.0 when compiled with clang++
// <http://lists.cs.uiuc.edu/pipermail/cfe-dev/2011-February/013207.html>.
// The version time stamp used below is the one in Debian 6.0.
-#include <cstring> // pull in __GLIBCXX__
+#include <cstring> // pull in __GLIBCXX__
#if defined __GLIBCXX__ && __GLIBCXX__ <= 20101114
-namespace std { class type_info; }
+namespace std {
+class type_info;
+}
#endif
-
-
#include <cassert>
-
-
#ifdef VML_DEBUG
-# define VML_ASSERT(x) assert(x)
+#define VML_ASSERT(x) assert(x)
#else
-# define VML_ASSERT(x) ((void)0)
+#define VML_ASSERT(x) ((void)0)
#endif
// Scalarise all vector operations, and use libm's functions (mostly
@@ -96,146 +90,142 @@ namespace std { class type_info; }
#ifdef __clang__
// Use compiler-provided vector types
-# include "vec_builtin.h"
+#include "vec_builtin.h"
#endif
// Scalarise all vector operations; don't use libm, use only
// Vecmathlib's functions (mostly useful for testing Vecmathlib)
#include "vec_test.h"
-#if defined __ARM_NEON__ // ARM NEON
-# include "vec_neon_float2.h"
-# include "vec_neon_float4.h"
-# define VML_CONFIG_NEON " NEON"
-#else
-# define VML_CONFIG_NEON
-#endif
-
-#if defined __SSE2__ // Intel SSE 2
-# include "vec_sse_float1.h"
-# include "vec_sse_float4.h"
-# include "vec_sse_double1.h"
-# include "vec_sse_double2.h"
-# if defined __SSE3__
-# define VML_CONFIG_SSE3 " SSE3"
-# else
-# define VML_CONFIG_SSE3
-# endif
-# if defined __SSSE3__
-# define VML_CONFIG_SSSE3 " SSSE3"
-# else
-# define VML_CONFIG_SSSE3
-# endif
-# if defined __SSE4_1__
-# define VML_CONFIG_SSE4_1 " SSE4.1"
-# else
-# define VML_CONFIG_SSE4_1
-# endif
-# if defined __SSE4a__
-# define VML_CONFIG_SSE4a " SSE4a"
-# else
-# define VML_CONFIG_SSE4a
-# endif
-# define VML_CONFIG_SSE2 " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
-#else
-# define VML_CONFIG_SSE2
-#endif
-
-#if defined __AVX__ // Intel AVX
-# include "vec_avx_fp8_32.h"
-# include "vec_avx_fp16_16.h"
-# include "vec_avx_float8.h"
-# include "vec_avx_double4.h"
-# define VML_CONFIG_AVX " AVX"
-#else
-# define VML_CONFIG_AVX
-#endif
-
-#if defined __MIC__ // Intel MIC
+#if defined __ARM_NEON__ // ARM NEON
+#include "vec_neon_float2.h"
+#include "vec_neon_float4.h"
+#define VML_CONFIG_NEON " NEON" // fragment concatenated into VECMATHLIB_CONFIGURATION
+#else // NEON not available
+#define VML_CONFIG_NEON // empty: adds nothing to the configuration string
+#endif // __ARM_NEON__
+
+#if defined __SSE2__ // Intel SSE 2
+#include "vec_sse_float1.h"
+#include "vec_sse_float4.h"
+#include "vec_sse_double1.h"
+#include "vec_sse_double2.h"
+#if defined __SSE3__ // nested: the SSE sub-level tags are only evaluated when SSE2 is present
+#define VML_CONFIG_SSE3 " SSE3"
+#else
+#define VML_CONFIG_SSE3
+#endif // __SSE3__
+#if defined __SSSE3__
+#define VML_CONFIG_SSSE3 " SSSE3"
+#else
+#define VML_CONFIG_SSSE3
+#endif // __SSSE3__
+#if defined __SSE4_1__
+#define VML_CONFIG_SSE4_1 " SSE4.1"
+#else
+#define VML_CONFIG_SSE4_1
+#endif // __SSE4_1__
+#if defined __SSE4a__
+#define VML_CONFIG_SSE4a " SSE4a"
+#else
+#define VML_CONFIG_SSE4a
+#endif // __SSE4a__
+#define VML_CONFIG_SSE2 \
+ " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a
+#else // no SSE2: the sub-level macros above stay undefined and are not referenced
+#define VML_CONFIG_SSE2
+#endif // __SSE2__
+
+#if defined __AVX__ // Intel AVX
+#include "vec_avx_fp8_32.h"
+#include "vec_avx_fp16_16.h"
+#include "vec_avx_float8.h"
+#include "vec_avx_double4.h"
+#define VML_CONFIG_AVX " AVX" // fragment concatenated into VECMATHLIB_CONFIGURATION
+#else // AVX not available
+#define VML_CONFIG_AVX // empty: adds nothing to the configuration string
+#endif // __AVX__
+
+#if defined __MIC__ // Intel MIC
// TODO: single precision?
-# include "vec_mic_double8.h"
-# define VML_CONFIG_MIC " MIC"
+#include "vec_mic_double8.h"
+#define VML_CONFIG_MIC " MIC"
#else
-# define VML_CONFIG_MIC
+#define VML_CONFIG_MIC
#endif
-#if defined __ALTIVEC__ // IBM Altivec
-# include "vec_altivec_float4.h"
-# define VML_CONFIG_ALTIVEC " Altivec"
+#if defined __ALTIVEC__ // IBM Altivec
+#include "vec_altivec_float4.h"
+#define VML_CONFIG_ALTIVEC " Altivec"
#else
-# define VML_CONFIG_ALTIVEC
+#define VML_CONFIG_ALTIVEC
#endif
#if defined __ALTIVEC__ && defined _ARCH_PWR7 // IBM VSX
-# include "vec_vsx_double2.h"
-# define VML_CONFIG_VSX " VSX"
+#include "vec_vsx_double2.h"
+#define VML_CONFIG_VSX " VSX"
#else
-# define VML_CONFIG_VSX
+#define VML_CONFIG_VSX
#endif
// TODO: IBM Blue Gene/P DoubleHummer
#if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX
// TODO: vec_qpx_float4
-# include "vec_qpx_double4.h"
-# define VML_CONFIG_QPX " QPX"
+#include "vec_qpx_double4.h"
+#define VML_CONFIG_QPX " QPX"
#else
-# define VML_CONFIG_QPX
+#define VML_CONFIG_QPX
#endif
-#define VECMATHLIB_CONFIGURATION \
- "VecmathlibConfiguration" \
- VML_CONFIG_DEBUG \
- VML_CONFIG_DENORMALS VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN \
- VML_CONFIG_NEON \
- VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC \
- VML_CONFIG_ALTIVEC VML_CONFIG_VSX \
- VML_CONFIG_QPX
-
-
+#define VECMATHLIB_CONFIGURATION \
+ "VecmathlibConfiguration" VML_CONFIG_DEBUG VML_CONFIG_DENORMALS \
+ VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN VML_CONFIG_NEON \
+ VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC VML_CONFIG_ALTIVEC \
+ VML_CONFIG_VSX VML_CONFIG_QPX
// Define "best" vector types
namespace vecmathlib {
-
+
#if defined VECMATHLIB_HAVE_VEC_FLOAT_16
-# define VECMATHLIB_MAX_FLOAT_VECSIZE 16
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 16 // the chain is ordered 16, 8, 4, 2, 1, so the first match is the widest float vector; the HAVE macros are presumably set by the vec_*.h headers
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_8
-# define VECMATHLIB_MAX_FLOAT_VECSIZE 8
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 8
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_4
-# define VECMATHLIB_MAX_FLOAT_VECSIZE 4
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 4
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_2
-# define VECMATHLIB_MAX_FLOAT_VECSIZE 2
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 2
#elif defined VECMATHLIB_HAVE_VEC_FLOAT_1
-# define VECMATHLIB_MAX_FLOAT_VECSIZE 1
+#define VECMATHLIB_MAX_FLOAT_VECSIZE 1 // if no HAVE macro matched at all, the size macro stays undefined
#endif
-
+
#if defined VECMATHLIB_HAVE_VEC_DOUBLE_8
-# define VECMATHLIB_MAX_DOUBLE_VECSIZE 8
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 8 // same scheme for double, ordered 8, 4, 2, 1
#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_4
-# define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 4
#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2
-# define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 2
#elif defined VECMATHLIB_HAVE_VEC_DOUBLE_1
-# define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
+#define VECMATHLIB_MAX_DOUBLE_VECSIZE 1
#endif
-
+
#ifdef VECMATHLIB_MAX_FLOAT_VECSIZE
- typedef realvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec;
- typedef intvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec;
- typedef boolvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
+typedef realvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec; // "best" float types: real, integer and boolean vectors of the widest size found
+typedef intvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec;
+typedef boolvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec;
#else
- typedef realpseudovec<float,1> float32_vec;
- typedef intpseudovec<float,1> int32_vec;
- typedef boolpseudovec<float,1> bool32_vec;
+typedef realpseudovec<float, 1> float32_vec; // fallback when no float vector size was selected: pseudo-vectors of size 1
+typedef intpseudovec<float, 1> int32_vec;
+typedef boolpseudovec<float, 1> bool32_vec;
#endif
-
+
#ifdef VECMATHLIB_MAX_DOUBLE_VECSIZE
- typedef realvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec;
- typedef intvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec;
- typedef boolvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
+typedef realvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec; // "best" double types: real, integer and boolean vectors of the widest size found
+typedef intvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec;
+typedef boolvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec;
#else
- typedef realpseudovec<double,1> float64_vec;
- typedef intpseudovec<double,1> int64_vec;
- typedef boolpseudovec<double,1> bool64_vec;
+typedef realpseudovec<double, 1> float64_vec; // fallback when no double vector size was selected: pseudo-vectors of size 1
+typedef intpseudovec<double, 1> int64_vec;
+typedef boolpseudovec<double, 1> bool64_vec;
#endif
}
OpenPOWER on IntegriCloud