diff options
44 files changed, 15431 insertions, 17970 deletions
@@ -16,47 +16,38 @@ using namespace std; using namespace vecmathlib; - - #ifndef __has_builtin -# define __has_builtin(x) 0 // Compatibility with non-clang compilers +#define __has_builtin(x) 0 // Compatibility with non-clang compilers #endif - - typedef unsigned long long ticks; -inline ticks getticks() -{ +inline ticks getticks() { #if __has_builtin(__builtin_readcyclecounter) return __builtin_readcyclecounter(); #elif defined __x86_64__ ticks a, d; - asm volatile("rdtsc" : "=a" (a), "=d" (d)); + asm volatile("rdtsc" : "=a"(a), "=d"(d)); return a | (d << 32); #elif defined __powerpc__ unsigned int tbl, tbu, tbu1; do { - asm volatile("mftbu %0": "=r"(tbu)); - asm volatile("mftb %0": "=r"(tbl)); - asm volatile("mftbu %0": "=r"(tbu1)); + asm volatile("mftbu %0" : "=r"(tbu)); + asm volatile("mftb %0" : "=r"(tbl)); + asm volatile("mftbu %0" : "=r"(tbu1)); } while (tbu != tbu1); return ((unsigned long long)tbu << 32) | tbl; #else timeval tv; gettimeofday(&tv, NULL); return 1000000ULL * tv.tv_sec + tv.tv_usec; - // timespec ts; - // clock_gettime(CLOCK_REALTIME, &ts); - // return 1000000000ULL * ts.tv_sec + ts.tv_nsec; +// timespec ts; +// clock_gettime(CLOCK_REALTIME, &ts); +// return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #endif } -inline double elapsed(ticks t1, ticks t0) -{ - return t1-t0; -} +inline double elapsed(ticks t1, ticks t0) { return t1 - t0; } -double get_sys_time() -{ +double get_sys_time() { timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + 1.0e-6 * tv.tv_usec; @@ -65,8 +56,7 @@ double get_sys_time() // return ts.tv_sec + 1.0e-9 * ts.tv_nsec; } -double measure_tick() -{ +double measure_tick() { ticks const rstart = getticks(); double const wstart = get_sys_time(); while (get_sys_time() - wstart < 0.1) { @@ -74,124 +64,103 @@ double measure_tick() } ticks const rend = getticks(); double const wend = get_sys_time(); - assert(wend-wstart >= 0.09); + assert(wend - wstart >= 0.09); return (wend - wstart) / elapsed(rend, rstart); } - - double global_result = 0.0; -template<typename realvec_t> -void save_result(realvec_t result) -{ - for (int i=0; i<realvec_t::size; ++i) { +template <typename realvec_t> void save_result(realvec_t result) { + for (int i = 0; i < realvec_t::size; ++i) { global_result += result[i]; } // Check global accumulator to prevent optimisation - if (! vml_std::isfinite(global_result)) { + if (!vml_std::isfinite(global_result)) { cout << "\n" << "WARNING: Global accumulator is not finite\n"; } } +template <typename T> inline T nop(T x) { return x; } +template <typename T> inline T fneg(T x) { return -x; } -template<typename T> inline T nop(T x) { return x; } - -template<typename T> inline T fneg(T x) { return -x; } +template <typename T> inline T fadd(T x, T y) { return x + y; } +template <typename T> inline T fsub(T x, T y) { return x - y; } +template <typename T> inline T fmul(T x, T y) { return x * y; } +template <typename T> inline T fdiv(T x, T y) { return x / y; } -template<typename T> inline T fadd(T x, T y) { return x+y; } -template<typename T> inline T fsub(T x, T y) { return x-y; } -template<typename T> inline T fmul(T x, T y) { return x*y; } -template<typename T> inline T fdiv(T x, T y) { return x/y; } - -template<typename T> inline T frexp0(T x) -{ +template <typename T> inline T frexp0(T x) { typename T::intvec_t ir; return frexp(x, &ir); } -template<typename T> inline typename T::intvec_t frexp1(T x) -{ +template <typename T> inline typename T::intvec_t frexp1(T x) { typename T::intvec_t ir; frexp(x, &ir); return ir; } -template<typename T> inline T ldexps(T x, T y) -{ +template <typename T> inline T ldexps(T x, T y) { typename T::intvec_t iy = convert_int(y); return ldexp(x, iy[0]); } -template<typename T> inline T ldexpv(T x, T y) -{ +template <typename T> inline T ldexpv(T x, T y) { typename T::intvec_t iy = convert_int(y); return ldexp(x, iy); } - - -#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX) \ - template<typename T> \ - struct functor_##FUNC { \ - static typename T::real_t get_xmin() { return XMIN; } \ - static typename T::real_t get_xmax() { return XMAX; } \ - static const char* name() { return #FUNC; } \ - T operator()(T x) { \ - return FUNC(x); \ - } \ +#define DECLARE_FUNCTOR(FUNC, XMIN, XMAX) \ + template <typename T> struct functor_##FUNC { \ + static typename T::real_t get_xmin() { return XMIN; } \ + static typename T::real_t get_xmax() { return XMAX; } \ + static const char *name() { return #FUNC; } \ + T operator()(T x) { return FUNC(x); } \ } -#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX) \ - template<typename T> \ - struct functor_##FUNC { \ - static typename T::real_t get_xmin() { return XMIN; } \ - static typename T::real_t get_xmax() { return XMAX; } \ - static const char* name() { return #FUNC; } \ - T operator()(T x) { \ - typename T::boolvec_t res = FUNC(x); \ - return convert_float(convert_int(res)); \ - } \ +#define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX) \ + template <typename T> struct functor_##FUNC { \ + static typename T::real_t get_xmin() { return XMIN; } \ + static typename T::real_t get_xmax() { return XMAX; } \ + static const char *name() { return #FUNC; } \ + T operator()(T x) { \ + typename T::boolvec_t res = FUNC(x); \ + return convert_float(convert_int(res)); \ + } \ } -#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX) \ - template<typename T> \ - struct functor_##FUNC { \ - static typename T::real_t get_xmin() { return XMIN; } \ - static typename T::real_t get_xmax() { return XMAX; } \ - static const char* name() { return #FUNC; } \ - T operator()(T x) { \ - typename T::intvec_t res = FUNC(x); \ - return convert_float(res); \ - } \ +#define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX) \ + template <typename T> struct functor_##FUNC { \ + static typename T::real_t get_xmin() { return XMIN; } \ + static typename T::real_t get_xmax() { return XMAX; } \ + static const char *name() { return #FUNC; } \ + T operator()(T x) { \ + typename T::intvec_t res = FUNC(x); \ + return convert_float(res); \ + } \ } -#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET) \ - template<typename T> \ - struct functor_##FUNC { \ - static typename T::real_t get_xmin() { return XMIN; } \ - static typename T::real_t get_xmax() { return XMAX; } \ - static const char* name() { return #FUNC; } \ - T operator()(T x) { \ - const typename T::real_t yoffset = YOFFSET; \ - return FUNC(x, x + T(yoffset)); \ - } \ +#define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET) \ + template <typename T> struct functor_##FUNC { \ + static typename T::real_t get_xmin() { return XMIN; } \ + static typename T::real_t get_xmax() { return XMAX; } \ + static const char *name() { return #FUNC; } \ + T operator()(T x) { \ + const typename T::real_t yoffset = YOFFSET; \ + return FUNC(x, x + T(yoffset)); \ + } \ } -#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET) \ - template<typename T> \ - struct functor_##FUNC { \ - static typename T::real_t get_xmin() { return XMIN; } \ - static typename T::real_t get_xmax() { return XMAX; } \ - static const char* name() { return #FUNC; } \ - T operator()(T x) { \ - const typename T::real_t yoffset = YOFFSET; \ - const typename T::real_t zoffset = ZOFFSET; \ - return FUNC(x, x + T(yoffset), x + T(zoffset)); \ - } \ +#define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET) \ + template <typename T> struct functor_##FUNC { \ + static typename T::real_t get_xmin() { return XMIN; } \ + static typename T::real_t get_xmax() { return XMAX; } \ + static const char *name() { return #FUNC; } \ + T operator()(T x) { \ + const typename T::real_t yoffset = YOFFSET; \ + const typename T::real_t zoffset = ZOFFSET; \ + return FUNC(x, x + T(yoffset), x + T(zoffset)); \ + } \ } - - DECLARE_FUNCTOR(nop, 0.0, 1.0); DECLARE_FUNCTOR(fneg, 0.0, 1.0); @@ -252,137 +221,127 @@ DECLARE_FUNCTOR(tan, 0.0, 1.0); DECLARE_FUNCTOR(tanh, -1.0, +1.0); DECLARE_FUNCTOR(trunc, -1.0, +1.0); - - -template<typename realvec_t, template<typename> class func_t> -double run_bench() -{ +template <typename realvec_t, template <typename> class func_t> +double run_bench() { const int numiters = 1000000; - + typedef typename realvec_t::real_t real_t; const real_t xmin = func_t<realvec_t>::get_xmin(); const real_t xmax = func_t<realvec_t>::get_xmax(); realvec_t x0, dx; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { x0.set_elt(i, xmin + (xmax - xmin) / numiters * i / realvec_t::size); dx.set_elt(i, (xmax - xmin) / numiters); } realvec_t x, y; ticks t0, t1; double const cycles_per_tick = 1.0; // measure_tick(); - + func_t<realvec_t> func; t0 = getticks(); x = y = x0; - for (int n=0; n<numiters; ++n) { + for (int n = 0; n < numiters; ++n) { y += func(x); x += dx; } t1 = getticks(); save_result(y); - - return cycles_per_tick * elapsed(t1,t0) * realvec_t::size / numiters; + + return cycles_per_tick * elapsed(t1, t0) * realvec_t::size / numiters; } -template<typename realvec_t, template<typename> class func_t> -void bench_type_func() -{ - cout << " " - << setw(-5) << func_t<realvec_t>::name() << " " - << setw(18) << realvec_t::name() << ": " << flush; +template <typename realvec_t, template <typename> class func_t> +void bench_type_func() { + cout << " " << setw(-5) << func_t<realvec_t>::name() << " " << setw(18) + << realvec_t::name() << ": " << flush; double const cycles = run_bench<realvec_t, func_t>(); cout << cycles << " cycles\n" << flush; } -template<template<typename> class func_t> -void bench_func() -{ +template <template <typename> class func_t> void bench_func() { cout << "\n" << "Benchmarking " << func_t<float32_vec>().name() << ":\n"; - + // Note: We benchmark neither testvec (since this is known to be // slow), nor builtinvec (since this has about the same performance // as pseudovec, and is also not very efficient). - - bench_type_func<realpseudovec<float,1>, func_t>(); + + bench_type_func<realpseudovec<float, 1>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<float,1>, func_t>(); + bench_type_func<realbuiltinvec<float, 1>, func_t>(); #endif - bench_type_func<realtestvec<float,1>, func_t>(); + bench_type_func<realtestvec<float, 1>, func_t>(); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1 - bench_type_func<realvec<float,1>, func_t>(); + bench_type_func<realvec<float, 1>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_2 - bench_type_func<realpseudovec<float,2>, func_t>(); + bench_type_func<realpseudovec<float, 2>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<float,2>, func_t>(); + bench_type_func<realbuiltinvec<float, 2>, func_t>(); #endif // bench_type_func<realtestvec<float,2>, func_t>(); - bench_type_func<realvec<float,2>, func_t>(); + bench_type_func<realvec<float, 2>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_4 - bench_type_func<realpseudovec<float,4>, func_t>(); + bench_type_func<realpseudovec<float, 4>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<float,4>, func_t>(); + bench_type_func<realbuiltinvec<float, 4>, func_t>(); #endif // bench_type_func<realtestvec<float,4>, func_t>(); - bench_type_func<realvec<float,4>, func_t>(); + bench_type_func<realvec<float, 4>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8 - bench_type_func<realpseudovec<float,8>, func_t>(); + bench_type_func<realpseudovec<float, 8>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<float,8>, func_t>(); + bench_type_func<realbuiltinvec<float, 8>, func_t>(); #endif // bench_type_func<realtestvec<float,8>, func_t>(); - bench_type_func<realvec<float,8>, func_t>(); + bench_type_func<realvec<float, 8>, func_t>(); #endif - - bench_type_func<realpseudovec<double,1>, func_t>(); + + bench_type_func<realpseudovec<double, 1>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<double,1>, func_t>(); + bench_type_func<realbuiltinvec<double, 1>, func_t>(); #endif - bench_type_func<realtestvec<double,1>, func_t>(); + bench_type_func<realtestvec<double, 1>, func_t>(); #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1 - bench_type_func<realvec<double,1>, func_t>(); + bench_type_func<realvec<double, 1>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2 - bench_type_func<realpseudovec<double,2>, func_t>(); + bench_type_func<realpseudovec<double, 2>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<double,2>, func_t>(); + bench_type_func<realbuiltinvec<double, 2>, func_t>(); #endif // bench_type_func<realtestvec<double,2>, func_t>(); - bench_type_func<realvec<double,2>, func_t>(); + bench_type_func<realvec<double, 2>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4 - bench_type_func<realpseudovec<double,4>, func_t>(); + bench_type_func<realpseudovec<double, 4>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<double,4>, func_t>(); + bench_type_func<realbuiltinvec<double, 4>, func_t>(); #endif // bench_type_func<realtestvec<double,4>, func_t>(); - bench_type_func<realvec<double,4>, func_t>(); + bench_type_func<realvec<double, 4>, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8 - bench_type_func<realpseudovec<double,8>, func_t>(); + bench_type_func<realpseudovec<double, 8>, func_t>(); #ifdef __clang__ - bench_type_func<realbuiltinvec<double,8>, func_t>(); + bench_type_func<realbuiltinvec<double, 8>, func_t>(); #endif // bench_type_func<realtestvec<double,8>, func_t>(); - bench_type_func<realvec<double,8>, func_t>(); + bench_type_func<realvec<double, 8>, func_t>(); #endif } - - -void bench() -{ +void bench() { bench_func<functor_nop>(); - + bench_func<functor_fneg>(); bench_func<functor_fadd>(); bench_func<functor_fsub>(); bench_func<functor_fmul>(); bench_func<functor_fdiv>(); - + bench_func<functor_acos>(); bench_func<functor_acosh>(); bench_func<functor_asin>(); @@ -436,10 +395,7 @@ void bench() bench_func<functor_trunc>(); } - - -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { cout << "Benchmarking math functions:\n"; bench(); return 0; @@ -7,20 +7,18 @@ using namespace std; using namespace vecmathlib; - - -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { // Declare a double precision vector with an architecture-dependent // number of elements float64_vec x; // Set each element separately. This is inefficient and should be // avoided if possible, but we want to demonstrate it here anyway. - for (int i=0; i<float64_vec::size; ++i) x.set_elt(i, double(i)); + for (int i = 0; i < float64_vec::size; ++i) + x.set_elt(i, double(i)); float64_vec y = x + float64_vec(1.0); y = sqrt(y); float64_vec z = log(y); - + // Boolean vectors are closely related to either double or float // vectors, thus we need to make a distinction bool64_vec b = x < y; @@ -29,12 +27,12 @@ int main(int argc, char** argv) // corresponding to "float64_vec", and there is "int_vec" // correpsonding to "float_vec". int64_vec i = convert_int(y); - + cout << "x=" << x << "\n"; cout << "y=" << y << "\n"; cout << "z=" << z << "\n"; cout << "b=" << b << "\n"; cout << "i=" << i << "\n"; - + return 0; } diff --git a/example_float.cc b/example_float.cc index fed91c7..4feea0e 100644 --- a/example_float.cc +++ b/example_float.cc @@ -7,20 +7,18 @@ using namespace std; using namespace vecmathlib; - - -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { // Declare a float precision vector with an architecture-dependent // number of elements float32_vec x; // Set each element separately. This is inefficient and should be // avoided if possible, but we want to demonstrate it here anyway. - for (int i=0; i<float32_vec::size; ++i) x.set_elt(i, float(i)); + for (int i = 0; i < float32_vec::size; ++i) + x.set_elt(i, float(i)); float32_vec y = x + float32_vec(1.0); y = sqrt(y); float32_vec z = log(y); - + // Boolean vectors are closely related to either float or float // vectors, thus we need to make a distinction bool32_vec b = x < y; @@ -29,12 +27,12 @@ int main(int argc, char** argv) // corresponding to "float32_vec", and there is "int_vec" // correpsonding to "float32_vec". int32_vec i = convert_int(y); - + cout << "x=" << x << "\n"; cout << "y=" << y << "\n"; cout << "z=" << z << "\n"; cout << "b=" << b << "\n"; cout << "i=" << i << "\n"; - + return 0; } diff --git a/floatbuiltins.h b/floatbuiltins.h index ee076a2..a7dd6f1 100644 --- a/floatbuiltins.h +++ b/floatbuiltins.h @@ -6,323 +6,383 @@ #if defined __clang__ namespace vecmathlib { - - inline char builtin_abs(char x) { return __builtin_abs(x); } - inline short builtin_abs(short x) { return __builtin_abs(x); } - inline int builtin_abs(int x) { return __builtin_abs(x); } - inline long builtin_abs(long x) { return __builtin_labs(x); } + +inline char builtin_abs(char x) { return __builtin_abs(x); } +inline short builtin_abs(short x) { return __builtin_abs(x); } +inline int builtin_abs(int x) { return __builtin_abs(x); } +inline long builtin_abs(long x) { return __builtin_labs(x); } #if __SIZEOF_LONG_LONG__ - inline long long builtin_abs(long long x) { return __builtin_llabs(x); } +inline long long builtin_abs(long long x) { return __builtin_llabs(x); } #endif - - inline unsigned char builtin_clz(unsigned char x) { return __builtin_clzs(x) - CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char)); } - inline unsigned short builtin_clz(unsigned short x) { return __builtin_clzs(x); } - inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); } - inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); } + +inline unsigned char builtin_clz(unsigned char x) { + return __builtin_clzs(x) - + CHAR_BIT * (sizeof(unsigned short) - sizeof(unsigned char)); +} +inline unsigned short builtin_clz(unsigned short x) { + return __builtin_clzs(x); +} +inline unsigned int builtin_clz(unsigned int x) { return __builtin_clz(x); } +inline unsigned long builtin_clz(unsigned long x) { return __builtin_clzl(x); } #if __SIZEOF_LONG_LONG__ - inline unsigned long long builtin_clz(unsigned long long x) { return __builtin_clzll(x); } +inline unsigned long long builtin_clz(unsigned long long x) { + return __builtin_clzll(x); +} #endif - - inline unsigned char builtin_popcount(unsigned char x) { return __builtin_popcount(x); } - inline unsigned short builtin_popcount(unsigned short x) { return __builtin_popcount(x); } - inline unsigned int builtin_popcount(unsigned int x) { return __builtin_popcount(x); } - inline unsigned long builtin_popcount(unsigned long x) { return __builtin_popcountl(x); } + +inline unsigned char builtin_popcount(unsigned char x) { + return __builtin_popcount(x); +} +inline unsigned short builtin_popcount(unsigned short x) { + return __builtin_popcount(x); +} +inline unsigned int builtin_popcount(unsigned int x) { + return __builtin_popcount(x); +} +inline unsigned long builtin_popcount(unsigned long x) { + return __builtin_popcountl(x); +} #if __SIZEOF_LONG_LONG__ - inline unsigned long long builtin_popcount(unsigned long long x) { return __builtin_popcountll(x); } +inline unsigned long long builtin_popcount(unsigned long long x) { + return __builtin_popcountll(x); +} #endif - - - - inline float builtin_acos(float x) { return __builtin_acosf(x); } - inline double builtin_acos(double x) { return __builtin_acos(x); } + +inline float builtin_acos(float x) { return __builtin_acosf(x); } +inline double builtin_acos(double x) { return __builtin_acos(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_acos(long double x) { return __builtin_acosl(x); } +inline long double builtin_acos(long double x) { return __builtin_acosl(x); } #endif - - inline float builtin_acosh(float x) { return __builtin_acoshf(x); } - inline double builtin_acosh(double x) { return __builtin_acosh(x); } + +inline float builtin_acosh(float x) { return __builtin_acoshf(x); } +inline double builtin_acosh(double x) { return __builtin_acosh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); } +inline long double builtin_acosh(long double x) { return __builtin_acoshl(x); } #endif - - inline float builtin_asin(float x) { return __builtin_asinf(x); } - inline double builtin_asin(double x) { return __builtin_asin(x); } + +inline float builtin_asin(float x) { return __builtin_asinf(x); } +inline double builtin_asin(double x) { return __builtin_asin(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_asin(long double x) { return __builtin_asinl(x); } +inline long double builtin_asin(long double x) { return __builtin_asinl(x); } #endif - - inline float builtin_asinh(float x) { return __builtin_asinhf(x); } - inline double builtin_asinh(double x) { return __builtin_asinh(x); } + +inline float builtin_asinh(float x) { return __builtin_asinhf(x); } +inline double builtin_asinh(double x) { return __builtin_asinh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); } +inline long double builtin_asinh(long double x) { return __builtin_asinhl(x); } #endif - - inline float builtin_atan(float x) { return __builtin_atanf(x); } - inline double builtin_atan(double x) { return __builtin_atan(x); } + +inline float builtin_atan(float x) { return __builtin_atanf(x); } +inline double builtin_atan(double x) { return __builtin_atan(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_atan(long double x) { return __builtin_atanl(x); } +inline long double builtin_atan(long double x) { return __builtin_atanl(x); } #endif - - inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); } - inline double builtin_atan2(double x, double y) { return __builtin_atan2(x, y); } + +inline float builtin_atan2(float x, float y) { return __builtin_atan2f(x, y); } +inline double builtin_atan2(double x, double y) { + return __builtin_atan2(x, y); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_atan2(long double x, long double y) { return __builtin_atan2l(x, y); } +inline long double builtin_atan2(long double x, long double y) { + return __builtin_atan2l(x, y); +} #endif - - inline float builtin_atanh(float x) { return __builtin_atanhf(x); } - inline double builtin_atanh(double x) { return __builtin_atanh(x); } + +inline float builtin_atanh(float x) { return __builtin_atanhf(x); } +inline double builtin_atanh(double x) { return __builtin_atanh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); } +inline long double builtin_atanh(long double x) { return __builtin_atanhl(x); } #endif - - inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); } - inline double builtin_cbrt(double x) { return __builtin_cbrt(x); } + +inline float builtin_cbrt(float x) { return __builtin_cbrtf(x); } +inline double builtin_cbrt(double x) { return __builtin_cbrt(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); } +inline long double builtin_cbrt(long double x) { return __builtin_cbrtl(x); } #endif - - inline float builtin_ceil(float x) { return __builtin_ceilf(x); } - inline double builtin_ceil(double x) { return __builtin_ceil(x); } + +inline float builtin_ceil(float x) { return __builtin_ceilf(x); } +inline double builtin_ceil(double x) { return __builtin_ceil(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_ceil(long double x) { return __builtin_ceill(x); } +inline long double builtin_ceil(long double x) { return __builtin_ceill(x); } #endif - - inline float builtin_copysign(float x, float y) { return __builtin_copysignf(x, y); } - inline double builtin_copysign(double x, double y) { return __builtin_copysign(x, y); } + +inline float builtin_copysign(float x, float y) { + return __builtin_copysignf(x, y); +} +inline double builtin_copysign(double x, double y) { + return __builtin_copysign(x, y); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_copysign(long double x, long double y) { return __builtin_copysignl(x, y); } +inline long double builtin_copysign(long double x, long double y) { + return __builtin_copysignl(x, y); +} #endif - inline float builtin_cos(float x) { return __builtin_cosf(x); } - inline double builtin_cos(double x) { return __builtin_cos(x); } +inline float builtin_cos(float x) { return __builtin_cosf(x); } +inline double builtin_cos(double x) { return __builtin_cos(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_cos(long double x) { return __builtin_cosl(x); } +inline long double builtin_cos(long double x) { return __builtin_cosl(x); } #endif - - inline float builtin_cosh(float x) { return __builtin_coshf(x); } - inline double builtin_cosh(double x) { return __builtin_cosh(x); } + +inline float builtin_cosh(float x) { return __builtin_coshf(x); } +inline double builtin_cosh(double x) { return __builtin_cosh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_cosh(long double x) { return __builtin_coshl(x); } +inline long double builtin_cosh(long double x) { return __builtin_coshl(x); } #endif - inline float builtin_exp(float x) { return __builtin_expf(x); } - inline double builtin_exp(double x) { return __builtin_exp(x); } +inline float builtin_exp(float x) { return __builtin_expf(x); } +inline double builtin_exp(double x) { return __builtin_exp(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_exp(long double x) { return __builtin_expl(x); } +inline long double builtin_exp(long double x) { return __builtin_expl(x); } #endif - - inline float builtin_exp2(float x) { return __builtin_exp2f(x); } - inline double builtin_exp2(double x) { return __builtin_exp2(x); } + +inline float builtin_exp2(float x) { return __builtin_exp2f(x); } +inline double builtin_exp2(double x) { return __builtin_exp2(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); } +inline long double builtin_exp2(long double x) { return __builtin_exp2l(x); } #endif - inline float builtin_expm1(float x) { return __builtin_expm1f(x); } - inline double builtin_expm1(double x) { return __builtin_expm1(x); } +inline float builtin_expm1(float x) { return __builtin_expm1f(x); } +inline double builtin_expm1(double x) { return __builtin_expm1(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); } +inline long double builtin_expm1(long double x) { return __builtin_expm1l(x); } #endif - inline float builtin_fabs(float x) { return __builtin_fabsf(x); } - inline double builtin_fabs(double x) { return __builtin_fabs(x); } +inline float builtin_fabs(float x) { return __builtin_fabsf(x); } +inline double builtin_fabs(double x) { return __builtin_fabs(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); } +inline long double builtin_fabs(long double x) { return __builtin_fabsl(x); } #endif - - inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); } - inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); } + +inline float builtin_fdim(float x, float y) { return __builtin_fdimf(x, y); } +inline double builtin_fdim(double x, double y) { return __builtin_fdim(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fdim(long double x, long double y) { return __builtin_fdiml(x, y); } +inline long double builtin_fdim(long double x, long double y) { + return __builtin_fdiml(x, y); +} #endif - - inline float builtin_floor(float x) { return __builtin_floorf(x); } - inline double builtin_floor(double x) { return __builtin_floor(x); } + +inline float builtin_floor(float x) { return __builtin_floorf(x); } +inline double builtin_floor(double x) { return __builtin_floor(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_floor(long double x) { return __builtin_floorl(x); } +inline long double builtin_floor(long double x) { return __builtin_floorl(x); } #endif - - inline float builtin_fma(float x, float y, float z) { return __builtin_fmaf(x, y, z); } - inline double builtin_fma(double x, double y, double z) { return __builtin_fma(x, y, z); } + +inline float builtin_fma(float x, float y, float z) { + return __builtin_fmaf(x, y, z); +} +inline double builtin_fma(double x, double y, double z) { + return __builtin_fma(x, y, z); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fma(long double x, long double y, long double z) { return __builtin_fmal(x, y, z); } +inline long double builtin_fma(long double x, long double y, long double z) { + return __builtin_fmal(x, y, z); +} #endif - - inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); } - inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); } + +inline float builtin_fmax(float x, float y) { return __builtin_fmaxf(x, y); } +inline double builtin_fmax(double x, double y) { return __builtin_fmax(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fmax(long double x, long double y) { return __builtin_fmaxl(x, y); } +inline long double builtin_fmax(long double x, long double y) { + return __builtin_fmaxl(x, y); +} #endif - - inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); } - inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); } + +inline float builtin_fmin(float x, float y) { return __builtin_fminf(x, y); } +inline double builtin_fmin(double x, double y) { return __builtin_fmin(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fmin(long double x, long double y) { return __builtin_fminl(x, y); } +inline long double builtin_fmin(long double x, long double y) { + return __builtin_fminl(x, y); +} #endif - - inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); } - inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); } + +inline float builtin_fmod(float x, float y) { return __builtin_fmodf(x, y); } +inline double builtin_fmod(double x, double y) { return __builtin_fmod(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_fmod(long double x, long double y) { return __builtin_fmodl(x, y); } +inline long double builtin_fmod(long double x, long double y) { + return __builtin_fmodl(x, y); +} #endif - - inline float builtin_frexp(float x, int* r) { return __builtin_frexpf(x, r); } - inline double builtin_frexp(double x, int* r) { return __builtin_frexp(x, r); } + +inline float builtin_frexp(float x, int *r) { return __builtin_frexpf(x, r); } +inline double builtin_frexp(double x, int *r) { return __builtin_frexp(x, r); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_frexp(long double x, int* r) { return __builtin_frexpl(x, r); } +inline long double builtin_frexp(long double x, int *r) { + return __builtin_frexpl(x, r); +} #endif - - inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); } - inline double builtin_hypot(double x, double y) { return __builtin_hypot(x, y); } + +inline float builtin_hypot(float x, float y) { return __builtin_hypotf(x, y); } +inline double builtin_hypot(double x, double y) { + return __builtin_hypot(x, y); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_hypot(long double x, long double y) { return __builtin_hypotl(x, y); } +inline long double builtin_hypot(long double x, long double y) { + return __builtin_hypotl(x, y); +} #endif - - inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); } - inline int builtin_ilogb(double x) { return __builtin_ilogb(x); } + +inline int builtin_ilogb(float x) { return __builtin_ilogbf(x); } +inline int builtin_ilogb(double x) { return __builtin_ilogb(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); } +inline int builtin_ilogb(long double x) { return __builtin_ilogbl(x); } #endif - - inline int builtin_isfinite(float x) { return __builtin_isfinite(x); } - inline int builtin_isfinite(double x) { return __builtin_isfinite(x); } + +inline int builtin_isfinite(float x) { return __builtin_isfinite(x); } +inline int builtin_isfinite(double x) { return __builtin_isfinite(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); } +inline int builtin_isfinite(long double x) { return __builtin_isfinite(x); } #endif - - inline int builtin_isinf(float x) { return __builtin_isinf(x); } - inline int builtin_isinf(double x) { return __builtin_isinf(x); } + +inline int builtin_isinf(float x) { return __builtin_isinf(x); } +inline int builtin_isinf(double x) { return __builtin_isinf(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_isinf(long double x) { return __builtin_isinf(x); } +inline int builtin_isinf(long double x) { return __builtin_isinf(x); } #endif - - inline int builtin_isnan(float x) { return __builtin_isnan(x); } - inline int builtin_isnan(double x) { return __builtin_isnan(x); } + +inline int builtin_isnan(float x) { return __builtin_isnan(x); } +inline int builtin_isnan(double x) { return __builtin_isnan(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_isnan(long double x) { return __builtin_isnan(x); } +inline int builtin_isnan(long double x) { return __builtin_isnan(x); } #endif - - inline int builtin_isnormal(float x) { return __builtin_isnormal(x); } - inline int builtin_isnormal(double x) { return __builtin_isnormal(x); } + +inline int builtin_isnormal(float x) { return __builtin_isnormal(x); } +inline int builtin_isnormal(double x) { return __builtin_isnormal(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); } +inline int builtin_isnormal(long double x) { return __builtin_isnormal(x); } #endif - - inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); } - inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); } + +inline float builtin_ldexp(float x, int y) { return __builtin_ldexpf(x, y); } +inline double builtin_ldexp(double x, int y) { return __builtin_ldexp(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_ldexp(long double x, int y) { return __builtin_ldexpl(x, y); } +inline long double builtin_ldexp(long double x, int y) { + return __builtin_ldexpl(x, y); +} #endif - - inline long long builtin_llrint(float x) { return __builtin_llrintf(x); } - inline long long builtin_llrint(double x) { return __builtin_llrint(x); } + +inline long long builtin_llrint(float x) { return __builtin_llrintf(x); } +inline long long builtin_llrint(double x) { return __builtin_llrint(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); } +inline long long builtin_llrint(long double x) { return __builtin_llrintl(x); } #endif - inline float builtin_log(float x) { return __builtin_logf(x); } - inline double builtin_log(double x) { return __builtin_log(x); } +inline float builtin_log(float x) { return __builtin_logf(x); } +inline double builtin_log(double x) { return __builtin_log(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_log(long double x) { return __builtin_logl(x); } +inline long double builtin_log(long double x) { return __builtin_logl(x); } #endif - inline float builtin_log10(float x) { return __builtin_log10f(x); } - inline double builtin_log10(double x) { return __builtin_log10(x); } +inline float builtin_log10(float x) { return __builtin_log10f(x); } +inline double builtin_log10(double x) { return __builtin_log10(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_log10(long double x) { return __builtin_log10l(x); } +inline long double builtin_log10(long double x) { return __builtin_log10l(x); } #endif - inline float builtin_log1p(float x) { return __builtin_log1pf(x); } - inline double builtin_log1p(double x) { return __builtin_log1p(x); } +inline float builtin_log1p(float x) { return __builtin_log1pf(x); } +inline double builtin_log1p(double x) { return __builtin_log1p(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); } +inline long double builtin_log1p(long double x) { return __builtin_log1pl(x); } #endif - inline float builtin_log2(float x) { return __builtin_log2f(x); } - inline double builtin_log2(double x) { return __builtin_log2(x); } +inline float builtin_log2(float x) { return __builtin_log2f(x); } +inline double builtin_log2(double x) { return __builtin_log2(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_log2(long double x) { return __builtin_log2l(x); } +inline long double builtin_log2(long double x) { return __builtin_log2l(x); } #endif - - inline long builtin_lrint(float x) { return __builtin_lrintf(x); } - inline long builtin_lrint(double x) { return __builtin_lrint(x); } + +inline long builtin_lrint(float x) { return __builtin_lrintf(x); } +inline long builtin_lrint(double x) { return __builtin_lrint(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long builtin_lrint(long double x) { return __builtin_lrintl(x); } +inline long builtin_lrint(long double x) { return __builtin_lrintl(x); } #endif - - inline float builtin_nextafter(float x, float y) { return __builtin_nextafterf(x, y); } - inline double builtin_nextafter(double x, double y) { return __builtin_nextafter(x, y); } + +inline float builtin_nextafter(float x, float y) { + return __builtin_nextafterf(x, y); +} +inline double builtin_nextafter(double x, double y) { + return __builtin_nextafter(x, y); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_nextafter(long double x, long double y) { return __builtin_nextafterl(x, y); } +inline long double builtin_nextafter(long double x, long double y) { + return __builtin_nextafterl(x, y); +} #endif - - inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); } - inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); } + +inline float builtin_pow(float x, float y) { return __builtin_powf(x, y); } +inline double builtin_pow(double x, double y) { return __builtin_pow(x, y); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_pow(long double x, long double y) { return __builtin_powl(x, y); } +inline long double builtin_pow(long double x, long double y) { + return __builtin_powl(x, y); +} #endif - - inline float builtin_remainder(float x, float y) { return __builtin_remainderf(x, y); } - inline double builtin_remainder(double x, double y) { return __builtin_remainder(x, y); } + +inline float builtin_remainder(float x, float y) { + return __builtin_remainderf(x, y); +} +inline double builtin_remainder(double x, double y) { + return __builtin_remainder(x, y); +} #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_remainder(long double x, long double y) { return __builtin_remainderl(x, y); } +inline long double builtin_remainder(long double x, long double y) { + return __builtin_remainderl(x, y); +} #endif - inline float builtin_rint(float x) { return __builtin_rintf(x); } - inline double builtin_rint(double x) { return __builtin_rint(x); } +inline float builtin_rint(float x) { return __builtin_rintf(x); } +inline double builtin_rint(double x) { return __builtin_rint(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_rint(long double x) { return __builtin_rintl(x); } +inline long double builtin_rint(long double x) { return __builtin_rintl(x); } #endif - inline float builtin_round(float x) { return __builtin_roundf(x); } - inline double builtin_round(double x) { return __builtin_round(x); } +inline float builtin_round(float x) { return __builtin_roundf(x); } +inline double builtin_round(double x) { return __builtin_round(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_round(long double x) { return __builtin_roundl(x); } +inline long double builtin_round(long double x) { return __builtin_roundl(x); } #endif - - inline int builtin_signbit(float x) { return __builtin_signbitf(x); } - inline int builtin_signbit(double x) { return __builtin_signbit(x); } + +inline int builtin_signbit(float x) { return __builtin_signbitf(x); } +inline int builtin_signbit(double x) { return __builtin_signbit(x); } #if __SIZEOF_LONG_DOUBLE__ - inline int builtin_signbit(long double x) { return __builtin_signbitl(x); } +inline int builtin_signbit(long double x) { return __builtin_signbitl(x); } #endif - inline float builtin_sin(float x) { return __builtin_sinf(x); } - inline double builtin_sin(double x) { return __builtin_sin(x); } +inline float builtin_sin(float x) { return __builtin_sinf(x); } +inline double builtin_sin(double x) { return __builtin_sin(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_sin(long double x) { return __builtin_sinl(x); } +inline long double builtin_sin(long double x) { return __builtin_sinl(x); } #endif - - inline float builtin_sinh(float x) { return __builtin_sinhf(x); } - inline double builtin_sinh(double x) { return __builtin_sinh(x); } + +inline float builtin_sinh(float x) { return __builtin_sinhf(x); } +inline double builtin_sinh(double x) { return __builtin_sinh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); } +inline long double builtin_sinh(long double x) { return __builtin_sinhl(x); } #endif - - inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); } - inline double builtin_sqrt(double x) { return __builtin_sqrt(x); } + +inline float builtin_sqrt(float x) { return __builtin_sqrtf(x); } +inline double builtin_sqrt(double x) { return __builtin_sqrt(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); } +inline long double builtin_sqrt(long double x) { return __builtin_sqrtl(x); } #endif - inline float builtin_tan(float x) { return __builtin_tanf(x); } - inline double builtin_tan(double x) { return __builtin_tan(x); } +inline float builtin_tan(float x) { return __builtin_tanf(x); } +inline double builtin_tan(double x) { return __builtin_tan(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_tan(long double x) { return __builtin_tanl(x); } +inline long double builtin_tan(long double x) { return __builtin_tanl(x); } #endif - - inline float builtin_tanh(float x) { return __builtin_tanhf(x); } - inline double builtin_tanh(double x) { return __builtin_tanh(x); } + +inline float builtin_tanh(float x) { return __builtin_tanhf(x); } +inline double builtin_tanh(double x) { return __builtin_tanh(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); } +inline long double builtin_tanh(long double x) { return __builtin_tanhl(x); } #endif - - inline float builtin_trunc(float x) { return __builtin_truncf(x); } - inline double builtin_trunc(double x) { return __builtin_trunc(x); } + +inline float builtin_trunc(float x) { return __builtin_truncf(x); } +inline double builtin_trunc(double x) { return __builtin_trunc(x); } #if __SIZEOF_LONG_DOUBLE__ - inline long double builtin_trunc(long double x) { return __builtin_truncl(x); } +inline long double builtin_trunc(long double x) { return __builtin_truncl(x); } #endif - } #endif -#endif // #ifndef FLOATBUILTINS_H +#endif // #ifndef FLOATBUILTINS_H diff --git a/floatprops.h b/floatprops.h index f1c39a2..c7a3b7f 100644 --- a/floatprops.h +++ b/floatprops.h @@ -10,310 +10,279 @@ #include <cstring> #include <limits> +namespace vecmathlib { +// A structure describing various properties of a floating point +// type. Most properties are already described in numeric_limits, so +// we inherit it. +template <typename real_t> struct floatprops { + // Some interesting properties are: + // min + // max + // digits + // epsilon + // min_exponent + // max_exponent + // infinity + // quiet_NaN +}; -namespace vecmathlib { - - // A structure describing various properties of a floating point - // type. Most properties are already described in numeric_limits, so - // we inherit it. - template<typename real_t> - struct floatprops { - // Some interesting properties are: - // min - // max - // digits - // epsilon - // min_exponent - // max_exponent - // infinity - // quiet_NaN - }; - - - - // Properties of fp8 - template<> - struct floatprops<fp8> { - typedef fp8 real_t; - typedef vml_std::int8_t int_t; - typedef vml_std::uint8_t uint_t; - - static char const* name() { return "fp8"; } - - // Definitions that might come from numeric_limits<> instead: - static real_t min() { __builtin_unreachable(); } - static real_t max() { __builtin_unreachable(); } - static int const digits = 4; - static real_t epsilon() { __builtin_unreachable(); } - static int const min_exponent = -6; - static int const max_exponent = 7; - static real_t infinity() { __builtin_unreachable(); } - static real_t quiet_NaN() { __builtin_unreachable(); } - - // Ensure the sizes match - static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); - static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); - - // Number of bits in internal representation - static int const bits = 8 * sizeof(real_t); - static int const mantissa_bits = digits - 1; - static int const signbit_bits = 1; - static int const exponent_bits = bits - mantissa_bits - signbit_bits; - static int const exponent_offset = 2 - min_exponent; - static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, - "error in bit counts"); - static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; - static uint_t const exponent_mask = - ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; - static uint_t const signbit_mask = uint_t(1) << (bits-1); - static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), - "error in masks"); - static_assert((mantissa_mask | exponent_mask | signbit_mask) == - uint_t(~uint_t(0)), - "error in masks"); - - // Re-interpret bit patterns - static real_t as_float(int_t x) - { - real_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t as_int(real_t x) - { - int_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t replicate_byte(unsigned char byte) - { - int_t res; - std::memset(&res, byte, sizeof res); - return res; - } - - // Convert values (truncate) - static real_t convert_float(int_t x) { __builtin_unreachable(); } - static int_t convert_int(real_t x) { __builtin_unreachable(); } - }; - - - - // Properties of fp16 - template<> - struct floatprops<fp16> { - typedef fp16 real_t; - typedef vml_std::int16_t int_t; - typedef vml_std::uint16_t uint_t; - - static char const* name() { return "fp16"; } - - // Definitions that might come from numeric_limits<> instead: - static real_t min() { __builtin_unreachable(); } - static real_t max() { __builtin_unreachable(); } - static int const digits = 11; - static real_t epsilon() { __builtin_unreachable(); } - static int const min_exponent = -14; - static int const max_exponent = 15; - static real_t infinity() { __builtin_unreachable(); } - static real_t quiet_NaN() { __builtin_unreachable(); } - - // Ensure the sizes match - static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); - static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); - - // Number of bits in internal representation - static int const bits = 8 * sizeof(real_t); - static int const mantissa_bits = digits - 1; - static int const signbit_bits = 1; - static int const exponent_bits = bits - mantissa_bits - signbit_bits; - static int const exponent_offset = 2 - min_exponent; - static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, - "error in bit counts"); - static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; - static uint_t const exponent_mask = - ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; - static uint_t const signbit_mask = uint_t(1) << (bits-1); - static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), - "error in masks"); - static_assert((mantissa_mask | exponent_mask | signbit_mask) == - uint_t(~uint_t(0)), - "error in masks"); - - // Re-interpret bit patterns - static real_t as_float(int_t x) - { - real_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t as_int(real_t x) - { - int_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t replicate_byte(unsigned char byte) - { - int_t res; - std::memset(&res, byte, sizeof res); - return res; - } - - // Convert values (truncate) - static real_t convert_float(int_t x) { __builtin_unreachable(); } - static int_t convert_int(real_t x) { __builtin_unreachable(); } - }; - - - - // Properties of float - template<> - struct floatprops<float>: std::numeric_limits<float> { - typedef float real_t; - typedef vml_std::int32_t int_t; - typedef vml_std::uint32_t uint_t; - - static char const* name() { return "float"; } - - // Ensure the internal representation is what we expect - static_assert(is_signed, "real_t is not signed"); - static_assert(radix==2, "real_t is not binary"); - - // Ensure the sizes match - static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); - static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); - - // Number of bits in internal representation - static int const bits = 8 * sizeof(real_t); - static int const mantissa_bits = digits - 1; - static int const signbit_bits = 1; - static int const exponent_bits = bits - mantissa_bits - signbit_bits; - static int const exponent_offset = 2 - min_exponent; - static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, - "error in bit counts"); - static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; - static uint_t const exponent_mask = - ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; - static uint_t const signbit_mask = uint_t(1) << (bits-1); - static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), - "error in masks"); - static_assert((mantissa_mask | exponent_mask | signbit_mask) == - uint_t(~uint_t(0)), - "error in masks"); - - // Re-interpret bit patterns - static real_t as_float(int_t x) - { - real_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t as_int(real_t x) - { - int_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t replicate_byte(unsigned char byte) - { - int_t res; - std::memset(&res, byte, sizeof res); - return res; - } - - // Convert values (truncate) - static real_t convert_float(int_t x) { return real_t(x); } - static int_t convert_int(real_t x) { return int_t(x); } - }; - - - - // Properties of double - template<> - struct floatprops<double>: std::numeric_limits<double> { - typedef double real_t; - typedef vml_std::int64_t int_t; - typedef vml_std::uint64_t uint_t; - - static char const* name() { return "double"; } - - // Ensure the internal representation is what we expect - static_assert(is_signed, "real_t is not signed"); - static_assert(radix==2, "real_t is not binary"); - - // Ensure the sizes match - static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); - static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); - - // Number of bits in internal representation - static int const bits = 8 * sizeof(real_t); - static int const mantissa_bits = digits - 1; - static int const signbit_bits = 1; - static int const exponent_bits = bits - mantissa_bits - signbit_bits; - static int const exponent_offset = 2 - min_exponent; - static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, - "error in bit counts"); - static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; - static uint_t const exponent_mask = - ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; - static uint_t const signbit_mask = uint_t(1) << (bits-1); - static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), - "error in masks"); - static_assert((mantissa_mask | exponent_mask | signbit_mask) == - uint_t(~uint_t(0)), - "error in masks"); - - // Re-interpret bit patterns - static real_t as_float(int_t x) - { - real_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t as_int(real_t x) - { - int_t res; - std::memcpy(&res, &x, sizeof res); - return res; - } - static int_t replicate_byte(unsigned char byte) - { - int_t res; - std::memset(&res, byte, sizeof res); - return res; - } - - // Convert values (truncate) - static real_t convert_float(int_t x) { return real_t(x); } - static int_t convert_int(real_t x) { return int_t(x); } - }; - - - - // We are adding the (unused) type RV here to avoid name mangling - // problems. On some systems, the vector size does not enter into - // the mangled name (!), leading to duplicate function definitions. - template<typename RV, typename V, typename E> - E get_elt(const V& v, const int n) - { - const size_t s = sizeof(E); - E e; - // assert(n>=0 and s*n<sizeof(V)); - std::memcpy(&e, &((const char*)&v)[s*n], s); - return e; +// Properties of fp8 +template <> struct floatprops<fp8> { + typedef fp8 real_t; + typedef vml_std::int8_t int_t; + typedef vml_std::uint8_t uint_t; + + static char const *name() { return "fp8"; } + + // Definitions that might come from numeric_limits<> instead: + static real_t min() { __builtin_unreachable(); } + static real_t max() { __builtin_unreachable(); } + static int const digits = 4; + static real_t epsilon() { __builtin_unreachable(); } + static int const min_exponent = -6; + static int const max_exponent = 7; + static real_t infinity() { __builtin_unreachable(); } + static real_t quiet_NaN() { __builtin_unreachable(); } + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1) + << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits - 1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t as_int(real_t x) { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t replicate_byte(unsigned char byte) { + int_t res; + std::memset(&res, byte, sizeof res); + return res; + } + + // Convert values (truncate) + static real_t convert_float(int_t x) { __builtin_unreachable(); } + static int_t convert_int(real_t x) { __builtin_unreachable(); } +}; + +// Properties of fp16 +template <> struct floatprops<fp16> { + typedef fp16 real_t; + typedef vml_std::int16_t int_t; + typedef vml_std::uint16_t uint_t; + + static char const *name() { return "fp16"; } + + // Definitions that might come from numeric_limits<> instead: + static real_t min() { __builtin_unreachable(); } + static real_t max() { __builtin_unreachable(); } + static int const digits = 11; + static real_t epsilon() { __builtin_unreachable(); } + static int const min_exponent = -14; + static int const max_exponent = 15; + static real_t infinity() { __builtin_unreachable(); } + static real_t quiet_NaN() { __builtin_unreachable(); } + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1) + << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits - 1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t as_int(real_t x) { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t replicate_byte(unsigned char byte) { + int_t res; + std::memset(&res, byte, sizeof res); + return res; + } + + // Convert values (truncate) + static real_t convert_float(int_t x) { __builtin_unreachable(); } + static int_t convert_int(real_t x) { __builtin_unreachable(); } +}; + +// Properties of float +template <> struct floatprops<float> : std::numeric_limits<float> { + typedef float real_t; + typedef vml_std::int32_t int_t; + typedef vml_std::uint32_t uint_t; + + static char const *name() { return "float"; } + + // Ensure the internal representation is what we expect + static_assert(is_signed, "real_t is not signed"); + static_assert(radix == 2, "real_t is not binary"); + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1) + << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits - 1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; } - - template<typename RV, typename V, typename E> - V& set_elt(V& v, const int n, const E e) - { - const size_t s = sizeof(E); - // assert(n>=0 and s*n<sizeof(V)); - std::memcpy(&((char*)&v)[s*n], &e, s); - return v; + static int_t as_int(real_t x) { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; } - + static int_t replicate_byte(unsigned char byte) { + int_t res; + std::memset(&res, byte, sizeof res); + return res; + } + + // Convert values (truncate) + static real_t convert_float(int_t x) { return real_t(x); } + static int_t convert_int(real_t x) { return int_t(x); } +}; + +// Properties of double +template <> struct floatprops<double> : std::numeric_limits<double> { + typedef double real_t; + typedef vml_std::int64_t int_t; + typedef vml_std::uint64_t uint_t; + + static char const *name() { return "double"; } + + // Ensure the internal representation is what we expect + static_assert(is_signed, "real_t is not signed"); + static_assert(radix == 2, "real_t is not binary"); + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = ((uint_t(1) << exponent_bits) - 1) + << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits - 1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t as_int(real_t x) { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t replicate_byte(unsigned char byte) { + int_t res; + std::memset(&res, byte, sizeof res); + return res; + } + + // Convert values (truncate) + static real_t convert_float(int_t x) { return real_t(x); } + static int_t convert_int(real_t x) { return int_t(x); } +}; + +// We are adding the (unused) type RV here to avoid name mangling +// problems. On some systems, the vector size does not enter into +// the mangled name (!), leading to duplicate function definitions. +template <typename RV, typename V, typename E> +E get_elt(const V &v, const int n) { + const size_t s = sizeof(E); + E e; + // assert(n>=0 and s*n<sizeof(V)); + std::memcpy(&e, &((const char *)&v)[s * n], s); + return e; +} + +template <typename RV, typename V, typename E> +V &set_elt(V &v, const int n, const E e) { + const size_t s = sizeof(E); + // assert(n>=0 and s*n<sizeof(V)); + std::memcpy(&((char *)&v)[s * n], &e, s); + return v; +} + } // namespace vecmathlib -#endif // #ifndef FLOATPROPS_H +#endif // #ifndef FLOATPROPS_H diff --git a/floattypes.h b/floattypes.h index 5107af6..e037b95 100644 --- a/floattypes.h +++ b/floattypes.h @@ -3,20 +3,14 @@ #ifndef FLOATTYPES_H #define FLOATTYPES_H - - #include <cassert> #include <cstdlib> - - -#if ! (defined __clang__ || defined __gcc__) -# define __builtin_unreachable() (assert(0)) -# define __builtin_expect(expr, val) (expr) +#if !(defined __clang__ || defined __gcc__) +#define __builtin_unreachable() (assert(0)) +#define __builtin_expect(expr, val) (expr) #endif - - // We expect either 199711L or 201103L #if __cplusplus >= 201103L // C++11 is supported, use it @@ -25,11 +19,9 @@ #include <cstdint> namespace vml_std { - using namespace std; +using namespace std; } - - #else // C++11 is not supported, work around the missing pieces @@ -40,38 +32,35 @@ namespace vml_std { #include <stdint.h> #ifndef static_assert -# define VML_CONCAT2(x, y) x##y -# define VML_CONCAT(x, y) VML_CONCAT2(x, y) -# define static_assert(cond, msg) \ - typedef int VML_CONCAT(vml_static_assert_, __LINE__)[(cond) ? 1 : -1] \ - __attribute__((__unused__)) +#define VML_CONCAT2(x, y) x##y +#define VML_CONCAT(x, y) VML_CONCAT2(x, y) +#define static_assert(cond, msg) typedef int VML_CONCAT( \ + vml_static_assert_, __LINE__)[(cond) ? 1 : -1] __attribute__((__unused__)) #endif - - // Capture libc macros, then undefine them #ifndef isfinite -# error "isfinite is not a macro" +#error "isfinite is not a macro" #endif #ifndef isinf -# error "isinf is not a macro" +#error "isinf is not a macro" #endif #ifndef isnan -# error "isnan is not a macro" +#error "isnan is not a macro" #endif #ifndef isnormal -# error "isnormal is not a macro" +#error "isnormal is not a macro" #endif #ifndef signbit -# error "signbit is not a macro" +#error "signbit is not a macro" #endif namespace { - template<typename T> inline int libc_isfinite(T x) { return isfinite(x); } - template<typename T> inline int libc_isinf(T x) { return isinf(x); } - template<typename T> inline int libc_isnan(T x) { return isnan(x); } - template<typename T> inline int libc_isnormal(T x) { return isnormal(x); } - template<typename T> inline int libc_signbit(T x) { return signbit(x); } +template <typename T> inline int libc_isfinite(T x) { return isfinite(x); } +template <typename T> inline int libc_isinf(T x) { return isinf(x); } +template <typename T> inline int libc_isnan(T x) { return isnan(x); } +template <typename T> inline int libc_isnormal(T x) { return isnormal(x); } +template <typename T> inline int libc_signbit(T x) { return signbit(x); } } // Include this before undefining the macros below @@ -83,153 +72,146 @@ namespace { #undef isnormal #undef signbit - - namespace vml_std { - - // Make some type definitions from stdint.h available in std - typedef ::uint8_t uint8_t; - typedef ::int8_t int8_t; - typedef ::uint16_t uint16_t; - typedef ::int16_t int16_t; - typedef ::uint32_t uint32_t; - typedef ::int32_t int32_t; + +// Make some type definitions from stdint.h available in std +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; #if __SIZEOF_LONG__ == 8 - // Even if both "long" and "long long" have the same size, they are - // still different types. In many cases, it is then preferable to - // use "long" instead of "long long". - typedef unsigned long uint64_t; - typedef long int64_t; +// Even if both "long" and "long long" have the same size, they are +// still different types. In many cases, it is then preferable to +// use "long" instead of "long long". +typedef unsigned long uint64_t; +typedef long int64_t; #else - typedef ::uint64_t uint64_t; - typedef ::int64_t int64_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; #endif - - - - // Make math functions from math.h available in vml_std - // (We could instead take some of them -- but not all -- from std.) - - inline float acos(float x) { return ::acosf(x); } - inline float acosh(float x) { return ::acoshf(x); } - inline float asin(float x) { return ::asinf(x); } - inline float asinh(float x) { return ::asinhf(x); } - inline float atan(float x) { return ::atanf(x); } - inline float atan2(float x, float y) { return ::atan2f(x, y); } - inline float atanh(float x) { return ::atanhf(x); } - inline float cbrt(float x) { return ::cbrtf(x); } - inline float ceil(float x) { return ::ceilf(x); } - inline float cos(float x) { return ::cosf(x); } - inline float cosh(float x) { return ::coshf(x); } - inline float copysign(float x, float y) { return ::copysignf(x, y); } - inline float exp(float x) { return ::expf(x); } - inline float exp2(float x) { return ::exp2f(x); } - inline float expm1(float x) { return ::expm1f(x); } - inline float fabs(float x) { return ::fabsf(x); } - inline float fdim(float x, float y) { return ::fdimf(x, y); } - inline float floor(float x) { return ::floorf(x); } - inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); } - inline float fmax(float x, float y) { return ::fmaxf(x, y); } - inline float fmin(float x, float y) { return ::fminf(x, y); } - inline float fmod(float x, float y) { return ::fmodf(x, y); } - inline float frexp(float x, int* r) { return ::frexpf(x, r); } - inline float hypot(float x, float y) { return ::hypotf(x, y); } - inline int ilogb(float x) { return ::ilogbf(x); } - inline bool isfinite(float x) { return libc_isfinite(x); } - inline bool isinf(float x) { return libc_isinf(x); } - inline bool isnan(float x) { return libc_isnan(x); } - inline bool isnormal(float x) { return libc_isnormal(x); } - inline float ldexp(float x, int n) { return ::ldexpf(x, n); } - inline long long llrint(float x) { return ::llrintf(x); } - inline float log(float x) { return ::logf(x); } - inline float log10(float x) { return ::log10f(x); } - inline float log1p(float x) { return ::log1pf(x); } - inline float log2(float x) { return ::log2f(x); } - inline long lrint(float x) { return ::lrintf(x); } - inline float nextafter(float x, float y) { return ::nextafterf(x, y); } - inline float pow(float x, float y) { return ::powf(x, y); } - inline float remainder(float x, float y) { return ::remainderf(x, y); } - inline float rint(float x) { return ::rintf(x); } - inline float round(float x) { return ::roundf(x); } - inline bool signbit(float x) { return libc_signbit(x); } - inline float sin(float x) { return ::sinf(x); } - inline float sinh(float x) { return ::sinhf(x); } - inline float sqrt(float x) { return ::sqrtf(x); } - inline float tan(float x) { return ::tanf(x); } - inline float tanh(float x) { return ::tanhf(x); } - inline float trunc(float x) { return ::truncf(x); } - - inline double acos(double x) { return ::acos(x); } - inline double acosh(double x) { return ::acosh(x); } - inline double asin(double x) { return ::asin(x); } - inline double asinh(double x) { return ::asinh(x); } - inline double atan(double x) { return ::atan(x); } - inline double atan2(double x, double y) { return ::atan2(x, y); } - inline double atanh(double x) { return ::atanh(x); } - inline double cbrt(double x) { return ::cbrt(x); } - inline double ceil(double x) { return ::ceil(x); } - inline double cos(double x) { return ::cos(x); } - inline double cosh(double x) { return ::cosh(x); } - inline double copysign(double x, double y) { return ::copysign(x, y); } - inline double exp(double x) { return ::exp(x); } - inline double exp2(double x) { return ::exp2(x); } - inline double expm1(double x) { return ::expm1(x); } - inline double fabs(double x) { return ::fabs(x); } - inline double fdim(double x, double y) { return ::fdim(x, y); } - inline double floor(double x) { return ::floor(x); } - inline double fma(double x, double y, double z) { return ::fma(x, y, z); } - inline double fmax(double x, double y) { return ::fmax(x, y); } - inline double fmin(double x, double y) { return ::fmin(x, y); } - inline double fmod(double x, double y) { return ::fmod(x, y); } - inline double frexp(double x, int* r) { return ::frexp(x, r); } - inline double hypot(double x, double y) { return ::hypot(x, y); } - inline int ilogb(double x) { return ::ilogb(x); } - inline bool isfinite(double x) { return libc_isfinite(x); } - inline bool isinf(double x) { return libc_isinf(x); } - inline bool isnan(double x) { return libc_isnan(x); } - inline bool isnormal(double x) { return libc_isnormal(x); } - inline double ldexp(double x, int n) { return ::ldexp(x, n); } - inline long long llrint(double x) { return ::llrint(x); } - inline double log(double x) { return ::log(x); } - inline double log10(double x) { return ::log10(x); } - inline double log1p(double x) { return ::log1p(x); } - inline double log2(double x) { return ::log2(x); } - inline long lrint(double x) { return ::lrint(x); } - inline double nextafter(double x, double y) { return ::nextafter(x, y); } - inline double pow(double x, double y) { return ::pow(x, y); } - inline double remainder(double x, double y) { return ::remainder(x, y); } - inline double rint(double x) { return ::rint(x); } - inline double round(double x) { return ::round(x); } - inline bool signbit(double x) { return libc_signbit(x); } - inline double sin(double x) { return ::sin(x); } - inline double sinh(double x) { return ::sinh(x); } - inline double sqrt(double x) { return ::sqrt(x); } - inline double tan(double x) { return ::tan(x); } - inline double tanh(double x) { return ::tanh(x); } - inline double trunc(double x) { return ::trunc(x); } - + +// Make math functions from math.h available in vml_std +// (We could instead take some of them -- but not all -- from std.) + +inline float acos(float x) { return ::acosf(x); } +inline float acosh(float x) { return ::acoshf(x); } +inline float asin(float x) { return ::asinf(x); } +inline float asinh(float x) { return ::asinhf(x); } +inline float atan(float x) { return ::atanf(x); } +inline float atan2(float x, float y) { return ::atan2f(x, y); } +inline float atanh(float x) { return ::atanhf(x); } +inline float cbrt(float x) { return ::cbrtf(x); } +inline float ceil(float x) { return ::ceilf(x); } +inline float cos(float x) { return ::cosf(x); } +inline float cosh(float x) { return ::coshf(x); } +inline float copysign(float x, float y) { return ::copysignf(x, y); } +inline float exp(float x) { return ::expf(x); } +inline float exp2(float x) { return ::exp2f(x); } +inline float expm1(float x) { return ::expm1f(x); } +inline float fabs(float x) { return ::fabsf(x); } +inline float fdim(float x, float y) { return ::fdimf(x, y); } +inline float floor(float x) { return ::floorf(x); } +inline float fma(float x, float y, float z) { return ::fmaf(x, y, z); } +inline float fmax(float x, float y) { return ::fmaxf(x, y); } +inline float fmin(float x, float y) { return ::fminf(x, y); } +inline float fmod(float x, float y) { return ::fmodf(x, y); } +inline float frexp(float x, int *r) { return ::frexpf(x, r); } +inline float hypot(float x, float y) { return ::hypotf(x, y); } +inline int ilogb(float x) { return ::ilogbf(x); } +inline bool isfinite(float x) { return libc_isfinite(x); } +inline bool isinf(float x) { return libc_isinf(x); } +inline bool isnan(float x) { return libc_isnan(x); } +inline bool isnormal(float x) { return libc_isnormal(x); } +inline float ldexp(float x, int n) { return ::ldexpf(x, n); } +inline long long llrint(float x) { return ::llrintf(x); } +inline float log(float x) { return ::logf(x); } +inline float log10(float x) { return ::log10f(x); } +inline float log1p(float x) { return ::log1pf(x); } +inline float log2(float x) { return ::log2f(x); } +inline long lrint(float x) { return ::lrintf(x); } +inline float nextafter(float x, float y) { return ::nextafterf(x, y); } +inline float pow(float x, float y) { return ::powf(x, y); } +inline float remainder(float x, float y) { return ::remainderf(x, y); } +inline float rint(float x) { return ::rintf(x); } +inline float round(float x) { return ::roundf(x); } +inline bool signbit(float x) { return libc_signbit(x); } +inline float sin(float x) { return ::sinf(x); } +inline float sinh(float x) { return ::sinhf(x); } +inline float sqrt(float x) { return ::sqrtf(x); } +inline float tan(float x) { return ::tanf(x); } +inline float tanh(float x) { return ::tanhf(x); } +inline float trunc(float x) { return ::truncf(x); } + +inline double acos(double x) { return ::acos(x); } +inline double acosh(double x) { return ::acosh(x); } +inline double asin(double x) { return ::asin(x); } +inline double asinh(double x) { return ::asinh(x); } +inline double atan(double x) { return ::atan(x); } +inline double atan2(double x, double y) { return ::atan2(x, y); } +inline double atanh(double x) { return ::atanh(x); } +inline double cbrt(double x) { return ::cbrt(x); } +inline double ceil(double x) { return ::ceil(x); } +inline double cos(double x) { return ::cos(x); } +inline double cosh(double x) { return ::cosh(x); } +inline double copysign(double x, double y) { return ::copysign(x, y); } +inline double exp(double x) { return ::exp(x); } +inline double exp2(double x) { return ::exp2(x); } +inline double expm1(double x) { return ::expm1(x); } +inline double fabs(double x) { return ::fabs(x); } +inline double fdim(double x, double y) { return ::fdim(x, y); } +inline double floor(double x) { return ::floor(x); } +inline double fma(double x, double y, double z) { return ::fma(x, y, z); } +inline double fmax(double x, double y) { return ::fmax(x, y); } +inline double fmin(double x, double y) { return ::fmin(x, y); } +inline double fmod(double x, double y) { return ::fmod(x, y); } +inline double frexp(double x, int *r) { return ::frexp(x, r); } +inline double hypot(double x, double y) { return ::hypot(x, y); } +inline int ilogb(double x) { return ::ilogb(x); } +inline bool isfinite(double x) { return libc_isfinite(x); } +inline bool isinf(double x) { return libc_isinf(x); } +inline bool isnan(double x) { return libc_isnan(x); } +inline bool isnormal(double x) { return libc_isnormal(x); } +inline double ldexp(double x, int n) { return ::ldexp(x, n); } +inline long long llrint(double x) { return ::llrint(x); } +inline double log(double x) { return ::log(x); } +inline double log10(double x) { return ::log10(x); } +inline double log1p(double x) { return ::log1p(x); } +inline double log2(double x) { return ::log2(x); } +inline long lrint(double x) { return ::lrint(x); } +inline double nextafter(double x, double y) { return ::nextafter(x, y); } +inline double pow(double x, double y) { return ::pow(x, y); } +inline double remainder(double x, double y) { return ::remainder(x, y); } +inline double rint(double x) { return ::rint(x); } +inline double round(double x) { return ::round(x); } +inline bool signbit(double x) { return libc_signbit(x); } +inline double sin(double x) { return ::sin(x); } +inline double sinh(double x) { return ::sinh(x); } +inline double sqrt(double x) { return ::sqrt(x); } +inline double tan(double x) { return ::tan(x); } +inline double tanh(double x) { return ::tanh(x); } +inline double trunc(double x) { return ::trunc(x); } } #endif +namespace vecmathlib { + +struct fp8 { + // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?) + vml_std::uint8_t val; + fp8() {} + fp8(double x) { __builtin_unreachable(); } +}; +struct fp16 { + // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?) + vml_std::uint16_t val; + fp16() {} + fp16(double x) { __builtin_unreachable(); } +}; -namespace vecmathlib { - - struct fp8 { - // 1 bit sign, 4 bits exponent, 3 bits mantissa, exponent offset 7 (?) - vml_std::uint8_t val; - fp8() {} - fp8(double x) { __builtin_unreachable(); } - }; - - struct fp16 { - // 1 bit sign, 5 bits exponent, 10 bits mantissa, exponent offset 15 (?) - vml_std::uint16_t val; - fp16() {} - fp16(double x) { __builtin_unreachable(); } - }; - } // namespace vecmathlib -#endif // #ifndef FLOATTYPES_H +#endif // #ifndef FLOATTYPES_H diff --git a/instantiations.cc b/instantiations.cc index 9bd5351..956e1b9 100644 --- a/instantiations.cc +++ b/instantiations.cc @@ -7,84 +7,105 @@ #include "vecmathlib.h" +namespace vecmathlib { +template <typename realvec_t, int n> +typename realvec_t::real_t get_elt(realvec_t x) { + return x[n]; +} +template <typename realvec_t, int n> +realvec_t set_elt(realvec_t x, typename realvec_t::real_t a) { + return x.set_elt(n, a); +} + +// template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x); +// template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x, +// realbuiltinvec<float,1> y); +// template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x, +// intbuiltinvec<float,1>::int_t n); +// template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x, +// intbuiltinvec<double,1>::int_t n); +// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, +// intbuiltinvec<double,2>::int_t n); +// template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, +// intbuiltinvec<double,2> n); +// template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c, +// realbuiltinvec<float,1> x, realbuiltinvec<float,1> y); +// template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t +// c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y); +// template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c, +// realbuiltinvec<float,4> x, realbuiltinvec<float,4> y); +// template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t +// c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y); -namespace vecmathlib { - - template<typename realvec_t, int n> - typename realvec_t::real_t get_elt(realvec_t x) - { - return x[n]; - } - template<typename realvec_t, int n> - realvec_t set_elt(realvec_t x, typename realvec_t::real_t a) - { - return x.set_elt(n, a); - } - - // template realbuiltinvec<float,1> fabs(realbuiltinvec<float,1> x); - // template realbuiltinvec<float,1> fmin(realbuiltinvec<float,1> x, realbuiltinvec<float,1> y); - // template intbuiltinvec<float,1> lsr(intbuiltinvec<float,1> x, intbuiltinvec<float,1>::int_t n); - // template intbuiltinvec<double,1> lsr(intbuiltinvec<double,1> x, intbuiltinvec<double,1>::int_t n); - // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2>::int_t n); - // template intbuiltinvec<double,2> lsr(intbuiltinvec<double,2> x, intbuiltinvec<double,2> n); - // template realbuiltinvec<float,1> ifthen(realbuiltinvec<float,1>::boolvec_t c, realbuiltinvec<float,1> x, realbuiltinvec<float,1> y); - // template realbuiltinvec<double,1> ifthen(realbuiltinvec<double,1>::boolvec_t c, realbuiltinvec<double,1> x, realbuiltinvec<double,1> y); - // template realbuiltinvec<float,4> ifthen(realbuiltinvec<float,4>::boolvec_t c, realbuiltinvec<float,4> x, realbuiltinvec<float,4> y); - // template realbuiltinvec<double,2> ifthen(realbuiltinvec<double,2>::boolvec_t c, realbuiltinvec<double,2> x, realbuiltinvec<double,2> y); - #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1 - template realvec<float,1> round(realvec<float,1> x); +template realvec<float, 1> round(realvec<float, 1> x); #endif - + #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8 - template intvec<float,8> popcount(intvec<float,8>); +template intvec<float, 8> popcount(intvec<float, 8>); #endif - + #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1 - template realvec<double,1> exp(realvec<double,1> x); - template realvec<double,1> log(realvec<double,1> x); - template realvec<double,1> sin(realvec<double,1> x); - template realvec<double,1> sqrt(realvec<double,1> x); - template realvec<double,1>::real_t get_elt<realvec<double,1>,0>(realvec<double,1> x); - template realvec<double,1> set_elt<realvec<double,1>,0>(realvec<double,1> x, realvec<double,1>::real_t a); +template realvec<double, 1> exp(realvec<double, 1> x); +template realvec<double, 1> log(realvec<double, 1> x); +template realvec<double, 1> sin(realvec<double, 1> x); +template realvec<double, 1> sqrt(realvec<double, 1> x); +template realvec<double, 1>::real_t +get_elt<realvec<double, 1>, 0>(realvec<double, 1> x); +template realvec<double, 1> +set_elt<realvec<double, 1>, 0>(realvec<double, 1> x, + realvec<double, 1>::real_t a); #endif - + #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2 - template realvec<double,2> exp(realvec<double,2> x); - template realvec<double,2> log(realvec<double,2> x); - template realvec<double,2> sin(realvec<double,2> x); - template realvec<double,2> sqrt(realvec<double,2> x); - template realvec<double,2>::real_t get_elt<realvec<double,2>,0>(realvec<double,2>); - template realvec<double,2>::real_t get_elt<realvec<double,2>,1>(realvec<double,2>); - template realvec<double,2> set_elt<realvec<double,2>,0>(realvec<double,2> x, realvec<double,2>::real_t a); - template realvec<double,2> set_elt<realvec<double,2>,1>(realvec<double,2> x, realvec<double,2>::real_t a); +template realvec<double, 2> exp(realvec<double, 2> x); +template realvec<double, 2> log(realvec<double, 2> x); +template realvec<double, 2> sin(realvec<double, 2> x); +template realvec<double, 2> sqrt(realvec<double, 2> x); +template realvec<double, 2>::real_t +get_elt<realvec<double, 2>, 0>(realvec<double, 2>); +template realvec<double, 2>::real_t +get_elt<realvec<double, 2>, 1>(realvec<double, 2>); +template realvec<double, 2> +set_elt<realvec<double, 2>, 0>(realvec<double, 2> x, + realvec<double, 2>::real_t a); +template realvec<double, 2> +set_elt<realvec<double, 2>, 1>(realvec<double, 2> x, + realvec<double, 2>::real_t a); #endif - + #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4 - template realvec<double,4> exp(realvec<double,4> x); - template realvec<double,4> log(realvec<double,4> x); - template realvec<double,4> sin(realvec<double,4> x); - template realvec<double,4> sqrt(realvec<double,4> x); - template realvec<double,4>::real_t get_elt<realvec<double,4>,0>(realvec<double,4>); - template realvec<double,4>::real_t get_elt<realvec<double,4>,1>(realvec<double,4>); - template realvec<double,4>::real_t get_elt<realvec<double,4>,2>(realvec<double,4>); - template realvec<double,4>::real_t get_elt<realvec<double,4>,3>(realvec<double,4>); - template realvec<double,4> set_elt<realvec<double,4>,0>(realvec<double,4> x, realvec<double,4>::real_t a); - template realvec<double,4> set_elt<realvec<double,4>,1>(realvec<double,4> x, realvec<double,4>::real_t a); - template realvec<double,4> set_elt<realvec<double,4>,2>(realvec<double,4> x, realvec<double,4>::real_t a); - template realvec<double,4> set_elt<realvec<double,4>,3>(realvec<double,4> x, realvec<double,4>::real_t a); - template intvec<double,4> popcount(intvec<double,4>); +template realvec<double, 4> exp(realvec<double, 4> x); +template realvec<double, 4> log(realvec<double, 4> x); +template realvec<double, 4> sin(realvec<double, 4> x); +template realvec<double, 4> sqrt(realvec<double, 4> x); +template realvec<double, 4>::real_t +get_elt<realvec<double, 4>, 0>(realvec<double, 4>); +template realvec<double, 4>::real_t +get_elt<realvec<double, 4>, 1>(realvec<double, 4>); +template realvec<double, 4>::real_t +get_elt<realvec<double, 4>, 2>(realvec<double, 4>); +template realvec<double, 4>::real_t +get_elt<realvec<double, 4>, 3>(realvec<double, 4>); +template realvec<double, 4> +set_elt<realvec<double, 4>, 0>(realvec<double, 4> x, + realvec<double, 4>::real_t a); +template realvec<double, 4> +set_elt<realvec<double, 4>, 1>(realvec<double, 4> x, + realvec<double, 4>::real_t a); +template realvec<double, 4> +set_elt<realvec<double, 4>, 2>(realvec<double, 4> x, + realvec<double, 4>::real_t a); +template realvec<double, 4> +set_elt<realvec<double, 4>, 3>(realvec<double, 4> x, + realvec<double, 4>::real_t a); +template intvec<double, 4> popcount(intvec<double, 4>); #endif - } - - // Various tests to detect auto-vectorization features - - #include <cassert> #include <cstdlib> using namespace std; @@ -92,32 +113,25 @@ using namespace std; using namespace vecmathlib; #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4 -typedef realvec<double,4> realV; +typedef realvec<double, 4> realV; #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 -typedef realvec<double,2> realV; +typedef realvec<double, 2> realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8 -typedef realvec<float,8> realV; +typedef realvec<float, 8> realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4 -typedef realvec<float,4> realV; +typedef realvec<float, 4> realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2 -typedef realvec<float,2> realV; +typedef realvec<float, 2> realV; #else -# error "There are no vector types" +#error "There are no vector types" #endif typedef realV::scalar_t real; const int vecsize = realV::size; - - // Simple, naive loop adding two arrays -extern "C" -void loop_add(real* a, - real* b, - real* c, - ptrdiff_t n) -{ - for (ptrdiff_t i=0; i<n; i+=vecsize) { +extern "C" void loop_add(real *a, real *b, real *c, ptrdiff_t n) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loadu(&b[i]); realV tmpc = realV::loadu(&c[i]); realV tmpa = tmpb + tmpc; @@ -125,16 +139,10 @@ void loop_add(real* a, } } - - // Declare pointers as restrict -extern "C" -void loop_add_restrict(real *restrict a, - real *restrict b, - real *restrict c, - ptrdiff_t n) -{ - for (ptrdiff_t i=0; i<n; i+=vecsize) { +extern "C" void loop_add_restrict(real *restrict a, real *restrict b, + real *restrict c, ptrdiff_t n) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loadu(&b[i]); realV tmpc = realV::loadu(&c[i]); realV tmpa = tmpb + tmpc; @@ -142,16 +150,10 @@ void loop_add_restrict(real *restrict a, } } - - // Declare pointers as restrict and aligned -extern "C" -void loop_add_aligned(real *restrict a, - real *restrict b, - real *restrict c, - ptrdiff_t n) -{ - for (ptrdiff_t i=0; i<n; i+=vecsize) { +extern "C" void loop_add_aligned(real *restrict a, real *restrict b, + real *restrict c, ptrdiff_t n) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = tmpb + tmpc; @@ -159,16 +161,11 @@ void loop_add_aligned(real *restrict a, } } - - // Reduction loop -extern "C" -real loop_dot_reduce(real *restrict a, - real *restrict b, - ptrdiff_t n) -{ +extern "C" real loop_dot_reduce(real *restrict a, real *restrict b, + ptrdiff_t n) { realV sumV = 0.0; - for (ptrdiff_t i=0; i<n; i+=vecsize) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpa = realV::loada(&a[i]); realV tmpb = realV::loada(&b[i]); sumV += tmpa * tmpb; @@ -176,16 +173,10 @@ real loop_dot_reduce(real *restrict a, return sum(sumV); } - - // Loop with a simple if condition (fmax) -extern "C" -void loop_if_simple(real *restrict a, - real *restrict b, - real *restrict c, - ptrdiff_t n) -{ - for (ptrdiff_t i=0; i<n; i+=vecsize) { +extern "C" void loop_if_simple(real *restrict a, real *restrict b, + real *restrict c, ptrdiff_t n) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = ifthen(tmpb > tmpc, tmpb, tmpc); @@ -193,16 +184,10 @@ void loop_if_simple(real *restrict a, } } - - // Loop with a complex if condition (select) -extern "C" -void loop_if(real *restrict a, - real *restrict b, - real *restrict c, - ptrdiff_t n) -{ - for (ptrdiff_t i=0; i<n; i+=vecsize) { +extern "C" void loop_if(real *restrict a, real *restrict b, real *restrict c, + ptrdiff_t n) { + for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = ifthen(tmpb > realV(0.0), tmpb * tmpc, realV(1.0)); @@ -210,16 +195,10 @@ void loop_if(real *restrict a, } } - - // Skip ghost points -extern "C" -void loop_add_masked(real *restrict a, - real *restrict b, - real *restrict c, - ptrdiff_t n) -{ - for (realV::mask_t mask(1, n-1, 0); mask; ++mask) { +extern "C" void loop_add_masked(real *restrict a, real *restrict b, + real *restrict c, ptrdiff_t n) { + for (realV::mask_t mask(1, n - 1, 0); mask; ++mask) { ptrdiff_t i = mask.index(); realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); @@ -13,12 +13,8 @@ typedef realvec_t::real_t real_t; typedef realvec_t::intvec_t intvec_t; typedef intvec_t::int_t int_t; - - -realvec_t interp(const real_t* array, ptrdiff_t size, - real_t xmin, real_t xmax, - realvec_t x) -{ +realvec_t interp(const real_t *array, ptrdiff_t size, real_t xmin, real_t xmax, + realvec_t x) { assert(size >= 2); // spacing real_t dx = (xmax - xmin) / (size - 1); @@ -29,11 +25,11 @@ realvec_t interp(const real_t* array, ptrdiff_t size, intvec_t n = convert_int(cell); // gather values from array realvec_t x0, x1; - for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + for (ptrdiff_t i = 0; i < realvec_t::size; ++i) { // ensure location is not out of bounds - ptrdiff_t j = max(ptrdiff_t(0), min(size-2, ptrdiff_t(n[i]))); + ptrdiff_t j = max(ptrdiff_t(0), min(size - 2, ptrdiff_t(n[i]))); x0.set_elt(i, array[j]); - x1.set_elt(i, array[j+1]); + x1.set_elt(i, array[j + 1]); } // determine interpolation weights realvec_t offset = scaled - cell; @@ -44,20 +40,18 @@ realvec_t interp(const real_t* array, ptrdiff_t size, return y; } - - -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { ptrdiff_t size = 1001; vector<real_t> array(size); - for (ptrdiff_t i=0; i<size; ++i) array[i] = real_t(i) / 1000.0; - + for (ptrdiff_t i = 0; i < size; ++i) + array[i] = real_t(i) / 1000.0; + real_t xmin = 0.0; real_t xmax = 0.5; realvec_t x = 0.333; cout << "x=" << x << "\n"; realvec_t y = interp(&array[0], size, xmin, xmax, x); cout << "y=" << y << "\n"; - + return 0; } @@ -14,68 +14,57 @@ using namespace std; using namespace vecmathlib; - - //////////////////////////////////////////////////////////////////////////////// // Helpers //////////////////////////////////////////////////////////////////////////////// #ifndef __has_builtin -# define __has_builtin(x) 0 // Compatibility with non-clang compilers +#define __has_builtin(x) 0 // Compatibility with non-clang compilers #endif // align upwards -static size_t align_up(size_t i, size_t size) -{ +static size_t align_up(size_t i, size_t size) { return (i + size - 1) / size * size; } - - //////////////////////////////////////////////////////////////////////////////// // High-resolution timer //////////////////////////////////////////////////////////////////////////////// typedef unsigned long long ticks; -inline ticks getticks() -{ +inline ticks getticks() { #if __has_builtin(__builtin_readcyclecounter) return __builtin_readcyclecounter(); #elif defined __x86_64__ ticks a, d; - asm volatile("rdtsc" : "=a" (a), "=d" (d)); + asm volatile("rdtsc" : "=a"(a), "=d"(d)); return a | (d << 32); #elif defined __powerpc__ unsigned int tbl, tbu, tbu1; do { - asm volatile("mftbu %0": "=r"(tbu)); - asm volatile("mftb %0": "=r"(tbl)); - asm volatile("mftbu %0": "=r"(tbu1)); + asm volatile("mftbu %0" : "=r"(tbu)); + asm volatile("mftb %0" : "=r"(tbl)); + asm volatile("mftbu %0" : "=r"(tbu1)); } while (tbu != tbu1); return ((unsigned long long)tbu << 32) | tbl; #else timeval tv; gettimeofday(&tv, NULL); return 1000000ULL * tv.tv_sec + tv.tv_usec; - // timespec ts; - // clock_gettime(CLOCK_REALTIME, &ts); - // return 1000000000ULL * ts.tv_sec + ts.tv_nsec; +// timespec ts; +// clock_gettime(CLOCK_REALTIME, &ts); +// return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #endif } -inline double elapsed(ticks t1, ticks t0) -{ - return t1-t0; -} +inline double elapsed(ticks t1, ticks t0) { return t1 - t0; } -double get_sys_time() -{ +double get_sys_time() { timeval tp; gettimeofday(&tp, NULL); return tp.tv_sec + 1.0e-6 * tp.tv_usec; } -double measure_tick() -{ +double measure_tick() { ticks const rstart = getticks(); double const wstart = get_sys_time(); while (get_sys_time() - wstart < 0.1) { @@ -83,236 +72,219 @@ double measure_tick() } ticks const rend = getticks(); double const wend = get_sys_time(); - assert(wend-wstart >= 0.09); + assert(wend - wstart >= 0.09); return (wend - wstart) / elapsed(rend, rstart); } - - //////////////////////////////////////////////////////////////////////////////// // Initialize the grid //////////////////////////////////////////////////////////////////////////////// -template<typename realvec_t> -void init(typename realvec_t::real_t *restrict xptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ - for (ptrdiff_t j=0; j<n; ++j) { - for (ptrdiff_t i=0; i<m; ++i) { - const ptrdiff_t ij = ldm*j + i; - xptr[ij] = (i+j)%2; +template <typename realvec_t> +void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm, + ptrdiff_t n) { + for (ptrdiff_t j = 0; j < n; ++j) { + for (ptrdiff_t i = 0; i < m; ++i) { + const ptrdiff_t ij = ldm * j + i; + xptr[ij] = (i + j) % 2; } } } - - //////////////////////////////////////////////////////////////////////////////// // Evolution loop: Simple stencil example (Gaussian smoothing) //////////////////////////////////////////////////////////////////////////////// // Introduce a delay, so that cache access is not so important -template<typename T> -static T delay(const T x) -{ +template <typename T> static T delay(const T x) { return x; // return log(exp(x)); } // Original version, unvectorized -template<typename realvec_t> +template <typename realvec_t> void smooth_scalar(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; - for (ptrdiff_t j=1; j<n-1; ++j) { - for (ptrdiff_t i=1; i<m-1; ++i) { - const ptrdiff_t ij = ldm*j + i; - const real_t x = xptr[ij]; - const real_t xil = xptr[ij-1]; - const real_t xir = xptr[ij+1]; - const real_t xjl = xptr[ij-ldm]; - const real_t xjr = xptr[ij+ldm]; + for (ptrdiff_t j = 1; j < n - 1; ++j) { + for (ptrdiff_t i = 1; i < m - 1; ++i) { + const ptrdiff_t ij = ldm * j + i; + const real_t x = xptr[ij]; + const real_t xil = xptr[ij - 1]; + const real_t xir = xptr[ij + 1]; + const real_t xjl = xptr[ij - ldm]; + const real_t xjr = xptr[ij + ldm]; const real_t y = - real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr); + real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr); yptr[ij] = delay(y); } } } - - // Assuming no particular alignment -template<typename realvec_t> +template <typename realvec_t> void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loadu(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storeu(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loadu(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storeu(delay(y), yptr + ij, mask); } } } - - // Assuming that xptr and yptr are aligned, but ldm can be arbitrary -template<typename realvec_t> +template <typename realvec_t> void smooth_aligned(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loada(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storea(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loada(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storea(delay(y), yptr + ij, mask); } } } - - // Assuming that xptr and yptr are aligned, and ldm is a multiple of // the vector size -template<typename realvec_t> +template <typename realvec_t> void smooth_padded(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; assert(ldm % realvec_t::size == 0); - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loada(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loada(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loada(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storea(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loada(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loada(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loada(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storea(delay(y), yptr + ij, mask); } } } - - //////////////////////////////////////////////////////////////////////////////// // Main routine //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { // Number of iterations const int niters = 100; - + // Grid size const ptrdiff_t m = 100; const ptrdiff_t n = 100; - - // Choose a vector size + +// Choose a vector size #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4 - typedef realvec<double,4> realvec_t; + typedef realvec<double, 4> realvec_t; #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 - typedef realvec<double,2> realvec_t; + typedef realvec<double, 2> realvec_t; #else - typedef realpseudovec<double,1> realvec_t; + typedef realpseudovec<double, 1> realvec_t; #endif - + // Ensure the grid size is aligned const ptrdiff_t ldm = align_up(m, realvec_t::size); typedef realvec_t::real_t real_t; - vector<real_t> x0(ldm*n + realvec_t::size-1), y0(ldm*n + realvec_t::size-1); - real_t* restrict const x = - (real_t*)align_up(intptr_t(&x0[0]), sizeof(realvec_t)); - real_t* restrict const y = - (real_t*)align_up(intptr_t(&y0[0]), sizeof(realvec_t)); - for (ptrdiff_t i=0; i<ldm*n; ++i) y[i] = 0.0; - + vector<real_t> x0(ldm * n + realvec_t::size - 1), + y0(ldm * n + realvec_t::size - 1); + real_t *restrict const x = + (real_t *)align_up(intptr_t(&x0[0]), sizeof(realvec_t)); + real_t *restrict const y = + (real_t *)align_up(intptr_t(&y0[0]), sizeof(realvec_t)); + for (ptrdiff_t i = 0; i < ldm * n; ++i) + y[i] = 0.0; + // Initialize init<realvec_t>(&x[0], m, ldm, n); - + // Timers ticks t0, t1; double const cycles_per_tick = 1.0; // measure_tick(); double cycles; - + // Run the different evolution loop versions t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_scalar<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_scalar: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_unaligned<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_unaligned: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_aligned<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_aligned: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_padded: " << cycles << " cycles/point\n"; - + return 0; } diff --git a/mathfuncs.h b/mathfuncs.h index 8d90f9a..9f042d1 100644 --- a/mathfuncs.h +++ b/mathfuncs.h @@ -19,4 +19,4 @@ #include "mathfuncs_sinh.h" #include "mathfuncs_sqrt.h" -#endif // #ifndef MATHFUNCS_H +#endif // #ifndef MATHFUNCS_H diff --git a/mathfuncs_asin.h b/mathfuncs_asin.h index 3dd9c75..cd174a2 100644 --- a/mathfuncs_asin.h +++ b/mathfuncs_asin.h @@ -7,206 +7,181 @@ #include <cmath> +namespace vecmathlib { +namespace { -namespace vecmathlib { - - - - namespace { - - template<typename realvec_t> - realvec_t mulsign(realvec_t x, realvec_t y) - { - typedef typename realvec_t::real_t real_t; - typedef typename realvec_t::intvec_t intvec_t; - typedef intvec_t IV; - typedef floatprops<real_t> FP; - - intvec_t value = as_int(x); - intvec_t sign = as_int(y) & IV(FP::signbit_mask); - return as_float(value ^ sign); - } - - // Note: the order of arguments is y, x, as is convention for atan2 - template<typename realvec_t> - realvec_t atan2k(realvec_t y, realvec_t x) - { - // Algorithm taken from SLEEF 2.80 - - typedef typename realvec_t::real_t real_t; - typedef typename realvec_t::boolvec_t boolvec_t; - typedef realvec_t RV; - - realvec_t q = RV(0.0); - - q = ifthen(signbit(x), RV(-2.0), q); - x = fabs(x); - - boolvec_t cond = y > x; - realvec_t x0 = x; - realvec_t y0 = y; - x = ifthen(cond, y0, x0); - y = ifthen(cond, -x0, y0); - q += ifthen(cond, RV(1.0), RV(0.0)); - - realvec_t s = y / x; - realvec_t t = s * s; - - realvec_t u; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - u = RV(0.00282363896258175373077393f); - u = mad(u, t, RV(-0.0159569028764963150024414f)); - u = mad(u, t, RV(0.0425049886107444763183594f)); - u = mad(u, t, RV(-0.0748900920152664184570312f)); - u = mad(u, t, RV(0.106347933411598205566406f)); - u = mad(u, t, RV(-0.142027363181114196777344f)); - u = mad(u, t, RV(0.199926957488059997558594f)); - u = mad(u, t, RV(-0.333331018686294555664062f)); - break; - case sizeof(double): - u = RV(-1.88796008463073496563746e-05); - u = mad(u, t, RV(0.000209850076645816976906797)); - u = mad(u, t, RV(-0.00110611831486672482563471)); - u = mad(u, t, RV(0.00370026744188713119232403)); - u = mad(u, t, RV(-0.00889896195887655491740809)); - u = mad(u, t, RV(0.016599329773529201970117)); - u = mad(u, t, RV(-0.0254517624932312641616861)); - u = mad(u, t, RV(0.0337852580001353069993897)); - u = mad(u, t, RV(-0.0407629191276836500001934)); - u = mad(u, t, RV(0.0466667150077840625632675)); - u = mad(u, t, RV(-0.0523674852303482457616113)); - u = mad(u, t, RV(0.0587666392926673580854313)); - u = mad(u, t, RV(-0.0666573579361080525984562)); - u = mad(u, t, RV(0.0769219538311769618355029)); - u = mad(u, t, RV(-0.090908995008245008229153)); - u = mad(u, t, RV(0.111111105648261418443745)); - u = mad(u, t, RV(-0.14285714266771329383765)); - u = mad(u, t, RV(0.199999999996591265594148)); - u = mad(u, t, RV(-0.333333333333311110369124)); - break; - } - - t = mad(u, t * s, s); - t = mad(q, RV(M_PI_2), t); - - return t; - } - - } - +template <typename realvec_t> realvec_t mulsign(realvec_t x, realvec_t y) { + typedef typename realvec_t::real_t real_t; + typedef typename realvec_t::intvec_t intvec_t; + typedef intvec_t IV; + typedef floatprops<real_t> FP; + intvec_t value = as_int(x); + intvec_t sign = as_int(y) & IV(FP::signbit_mask); + return as_float(value ^ sign); +} - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) - { - // Algorithm taken from SLEEF 2.80 - return mulsign(atan2k(fabs(d), sqrt((RV(1.0)+d)*(RV(1.0)-d))), d); - } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) - { - // Algorithm taken from SLEEF 2.80 - return (mulsign(atan2k(sqrt((RV(1.0)+d)*(RV(1.0)-d)), fabs(d)), d) + - ifthen(d < RV(0.0), RV(M_PI), RV(0.0))); - } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s) - { - // Algorithm taken from SLEEF 2.80 - - realvec_t q1 = s; - s = fabs(s); - - boolvec_t q0 = s > RV(1.0); - s = ifthen(q0, rcp(s), s); - - realvec_t t = s * s; - - realvec_t u; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - u = RV(0.00282363896258175373077393f); - u = mad(u, t, RV(-0.0159569028764963150024414f)); - u = mad(u, t, RV(0.0425049886107444763183594f)); - u = mad(u, t, RV(-0.0748900920152664184570312f)); - u = mad(u, t, RV(0.106347933411598205566406f)); - u = mad(u, t, RV(-0.142027363181114196777344f)); - u = mad(u, t, RV(0.199926957488059997558594f)); - u = mad(u, t, RV(-0.333331018686294555664062f)); - break; - case sizeof(double): - u = RV(-1.88796008463073496563746e-05); - u = mad(u, t, RV(0.000209850076645816976906797)); - u = mad(u, t, RV(-0.00110611831486672482563471)); - u = mad(u, t, RV(0.00370026744188713119232403)); - u = mad(u, t, RV(-0.00889896195887655491740809)); - u = mad(u, t, RV(0.016599329773529201970117)); - u = mad(u, t, RV(-0.0254517624932312641616861)); - u = mad(u, t, RV(0.0337852580001353069993897)); - u = mad(u, t, RV(-0.0407629191276836500001934)); - u = mad(u, t, RV(0.0466667150077840625632675)); - u = mad(u, t, RV(-0.0523674852303482457616113)); - u = mad(u, t, RV(0.0587666392926673580854313)); - u = mad(u, t, RV(-0.0666573579361080525984562)); - u = mad(u, t, RV(0.0769219538311769618355029)); - u = mad(u, t, RV(-0.090908995008245008229153)); - u = mad(u, t, RV(0.111111105648261418443745)); - u = mad(u, t, RV(-0.14285714266771329383765)); - u = mad(u, t, RV(0.199999999996591265594148)); - u = mad(u, t, RV(-0.333333333333311110369124)); - break; - } - - t = s + s * (t * u); - - t = ifthen(q0, RV(M_PI_2) - t, t); - t = copysign(t, q1); - - return t; +// Note: the order of arguments is y, x, as is convention for atan2 +template <typename realvec_t> realvec_t atan2k(realvec_t y, realvec_t x) { + // Algorithm taken from SLEEF 2.80 + + typedef typename realvec_t::real_t real_t; + typedef typename realvec_t::boolvec_t boolvec_t; + typedef realvec_t RV; + + realvec_t q = RV(0.0); + + q = ifthen(signbit(x), RV(-2.0), q); + x = fabs(x); + + boolvec_t cond = y > x; + realvec_t x0 = x; + realvec_t y0 = y; + x = ifthen(cond, y0, x0); + y = ifthen(cond, -x0, y0); + q += ifthen(cond, RV(1.0), RV(0.0)); + + realvec_t s = y / x; + realvec_t t = s * s; + + realvec_t u; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + u = RV(0.00282363896258175373077393f); + u = mad(u, t, RV(-0.0159569028764963150024414f)); + u = mad(u, t, RV(0.0425049886107444763183594f)); + u = mad(u, t, RV(-0.0748900920152664184570312f)); + u = mad(u, t, RV(0.106347933411598205566406f)); + u = mad(u, t, RV(-0.142027363181114196777344f)); + u = mad(u, t, RV(0.199926957488059997558594f)); + u = mad(u, t, RV(-0.333331018686294555664062f)); + break; + case sizeof(double): + u = RV(-1.88796008463073496563746e-05); + u = mad(u, t, RV(0.000209850076645816976906797)); + u = mad(u, t, RV(-0.00110611831486672482563471)); + u = mad(u, t, RV(0.00370026744188713119232403)); + u = mad(u, t, RV(-0.00889896195887655491740809)); + u = mad(u, t, RV(0.016599329773529201970117)); + u = mad(u, t, RV(-0.0254517624932312641616861)); + u = mad(u, t, RV(0.0337852580001353069993897)); + u = mad(u, t, RV(-0.0407629191276836500001934)); + u = mad(u, t, RV(0.0466667150077840625632675)); + u = mad(u, t, RV(-0.0523674852303482457616113)); + u = mad(u, t, RV(0.0587666392926673580854313)); + u = mad(u, t, RV(-0.0666573579361080525984562)); + u = mad(u, t, RV(0.0769219538311769618355029)); + u = mad(u, t, RV(-0.090908995008245008229153)); + u = mad(u, t, RV(0.111111105648261418443745)); + u = mad(u, t, RV(-0.14285714266771329383765)); + u = mad(u, t, RV(0.199999999996591265594148)); + u = mad(u, t, RV(-0.333333333333311110369124)); + break; } - - - // Note: the order of arguments is y, x, as is convention for atan2 - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x) - { - // Algorithm taken from SLEEF 2.80 - - realvec_t r = atan2k(fabs(y), x); - - r = mulsign(r, x); - - r = ifthen(isinf(x) || x == RV(0.0), - ifthen(isinf(x), - RV(M_PI_2) - copysign(RV(M_PI_2), x), - RV(M_PI_2)), - r); - - r = ifthen(isinf(y), - ifthen(isinf(x), - RV(M_PI_2) - copysign(RV(M_PI_4), x), - RV(M_PI_2)), - r); - - r = ifthen(y == RV(0.0), - ifthen(signbit(x), RV(M_PI), RV(0.0)), - r); - - const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); - return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y)); + t = mad(u, t * s, s); + t = mad(q, RV(M_PI_2), t); + + return t; +} +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) { + // Algorithm taken from SLEEF 2.80 + return mulsign(atan2k(fabs(d), sqrt((RV(1.0) + d) * (RV(1.0) - d))), d); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) { + // Algorithm taken from SLEEF 2.80 + return (mulsign(atan2k(sqrt((RV(1.0) + d) * (RV(1.0) - d)), fabs(d)), d) + + ifthen(d < RV(0.0), RV(M_PI), RV(0.0))); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_atan(realvec_t s) { + // Algorithm taken from SLEEF 2.80 + + realvec_t q1 = s; + s = fabs(s); + + boolvec_t q0 = s > RV(1.0); + s = ifthen(q0, rcp(s), s); + + realvec_t t = s * s; + + realvec_t u; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + u = RV(0.00282363896258175373077393f); + u = mad(u, t, RV(-0.0159569028764963150024414f)); + u = mad(u, t, RV(0.0425049886107444763183594f)); + u = mad(u, t, RV(-0.0748900920152664184570312f)); + u = mad(u, t, RV(0.106347933411598205566406f)); + u = mad(u, t, RV(-0.142027363181114196777344f)); + u = mad(u, t, RV(0.199926957488059997558594f)); + u = mad(u, t, RV(-0.333331018686294555664062f)); + break; + case sizeof(double): + u = RV(-1.88796008463073496563746e-05); + u = mad(u, t, RV(0.000209850076645816976906797)); + u = mad(u, t, RV(-0.00110611831486672482563471)); + u = mad(u, t, RV(0.00370026744188713119232403)); + u = mad(u, t, RV(-0.00889896195887655491740809)); + u = mad(u, t, RV(0.016599329773529201970117)); + u = mad(u, t, RV(-0.0254517624932312641616861)); + u = mad(u, t, RV(0.0337852580001353069993897)); + u = mad(u, t, RV(-0.0407629191276836500001934)); + u = mad(u, t, RV(0.0466667150077840625632675)); + u = mad(u, t, RV(-0.0523674852303482457616113)); + u = mad(u, t, RV(0.0587666392926673580854313)); + u = mad(u, t, RV(-0.0666573579361080525984562)); + u = mad(u, t, RV(0.0769219538311769618355029)); + u = mad(u, t, RV(-0.090908995008245008229153)); + u = mad(u, t, RV(0.111111105648261418443745)); + u = mad(u, t, RV(-0.14285714266771329383765)); + u = mad(u, t, RV(0.199999999996591265594148)); + u = mad(u, t, RV(-0.333333333333311110369124)); + break; } - + + t = s + s * (t * u); + + t = ifthen(q0, RV(M_PI_2) - t, t); + t = copysign(t, q1); + + return t; +} + +// Note: the order of arguments is y, x, as is convention for atan2 +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_atan2(realvec_t y, realvec_t x) { + // Algorithm taken from SLEEF 2.80 + + realvec_t r = atan2k(fabs(y), x); + + r = mulsign(r, x); + + r = ifthen(isinf(x) || x == RV(0.0), + ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_2), x), RV(M_PI_2)), + r); + + r = ifthen(isinf(y), + ifthen(isinf(x), RV(M_PI_2) - copysign(RV(M_PI_4), x), RV(M_PI_2)), + r); + + r = ifthen(y == RV(0.0), ifthen(signbit(x), RV(M_PI), RV(0.0)), r); + + const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); + return ifthen(isnan(x) || isnan(y), RV(nan), mulsign(r, y)); +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_ASIN_H +#endif // #ifndef MATHFUNCS_ASIN_H diff --git a/mathfuncs_asinh.h b/mathfuncs_asinh.h index c7be8eb..1197261 100644 --- a/mathfuncs_asinh.h +++ b/mathfuncs_asinh.h @@ -7,36 +7,31 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x) { + // Reduce range + realvec_t r = fabs(x); + r = log(r + sqrt(r * r + RV(1.0))); + r = copysign(r, x); + return r; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x) { + return log(x + sqrt(x * x - RV(1.0))); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x) { + // Reduce range + realvec_t r = fabs(x); + r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r)); + r = copysign(r, x); + return r; +} -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_asinh(realvec_t x) - { - // Reduce range - realvec_t r = fabs(x); - r = log(r + sqrt(r*r + RV(1.0))); - r = copysign(r, x); - return r; - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_acosh(realvec_t x) - { - return log(x + sqrt(x*x - RV(1.0))); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_atanh(realvec_t x) - { - // Reduce range - realvec_t r = fabs(x); - r = RV(0.5) * log((RV(1.0) + r) / (RV(1.0) - r)); - r = copysign(r, x); - return r; - } - }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_ASINH_H +#endif // #ifndef MATHFUNCS_ASINH_H diff --git a/mathfuncs_base.h b/mathfuncs_base.h index c685542..8545003 100644 --- a/mathfuncs_base.h +++ b/mathfuncs_base.h @@ -5,130 +5,127 @@ #include "floatprops.h" +namespace vecmathlib { +template <typename realvec_t> struct mathfuncs { + typedef floatprops<typename realvec_t::real_t> FP; + + typedef typename FP::real_t real_t; + typedef typename FP::int_t int_t; + typedef typename FP::uint_t uint_t; + + static int const size = realvec_t::size; + + // typedef realvec<real_t, size> realvec_t; + typedef typename realvec_t::intvec_t intvec_t; + typedef typename realvec_t::boolvec_t boolvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + // static real_t R(double a) { return real_t(a); } + // static int_t I(int a) { return int_t(a); } + // static uint_t U(int a) { return uint_t(a); } + // static realvec_t RV(real_t a) { return realvec_t(a); } + // static intvec_t IV(int_t a) { return intvec_t(a); } + // static boolvec_t BV(bool a) { return boolvec_t(a); } + + // int + static intvec_t vml_abs(intvec_t x); + static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z); + static intvec_t vml_clz(intvec_t x); + static boolvec_t vml_isignbit(intvec_t x); + static intvec_t vml_max(intvec_t x, intvec_t y); + static intvec_t vml_min(intvec_t x, intvec_t y); + static intvec_t vml_popcount(intvec_t x); + static intvec_t vml_rotate(intvec_t x, int_t n); + static intvec_t vml_rotate(intvec_t x, intvec_t n); + + // asin + static realvec_t vml_acos(realvec_t x); + static realvec_t vml_asin(realvec_t x); + static realvec_t vml_atan(realvec_t x); + static realvec_t vml_atan2(realvec_t y, realvec_t x); + + // asinh + static realvec_t vml_acosh(realvec_t x); + static realvec_t vml_asinh(realvec_t x); + static realvec_t vml_atanh(realvec_t x); + + // convert + static realvec_t vml_antitrunc(realvec_t x); + static realvec_t vml_ceil(realvec_t x); + static realvec_t vml_convert_float(intvec_t x); + static intvec_t vml_convert_int(realvec_t x); + static realvec_t vml_floor(realvec_t x); + static intvec_t vml_lrint(realvec_t x); + static realvec_t vml_rint(realvec_t x); + static realvec_t vml_round(realvec_t x); + static realvec_t vml_nextafter(realvec_t x, realvec_t y); + static realvec_t vml_trunc(realvec_t x); + + // fabs + static realvec_t vml_copysign(realvec_t x, realvec_t y); + static realvec_t vml_fabs(realvec_t x); + static realvec_t vml_fdim(realvec_t x, realvec_t y); + static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z); + static realvec_t vml_fmax(realvec_t x, realvec_t y); + static realvec_t vml_fmin(realvec_t x, realvec_t y); + static realvec_t vml_frexp(realvec_t x, intvec_t *r); + static intvec_t vml_ilogb(realvec_t x); + static boolvec_t vml_ieee_isfinite(realvec_t x); + static boolvec_t vml_ieee_isinf(realvec_t x); + static boolvec_t vml_ieee_isnan(realvec_t x); + static boolvec_t vml_ieee_isnormal(realvec_t x); + static boolvec_t vml_isfinite(realvec_t x); + static boolvec_t vml_isinf(realvec_t x); + static boolvec_t vml_isnan(realvec_t x); + static boolvec_t vml_isnormal(realvec_t x); + static realvec_t vml_ldexp(realvec_t x, intvec_t n); + static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z); + static boolvec_t vml_signbit(realvec_t x); + + // exp + static realvec_t vml_exp(realvec_t x); + static realvec_t vml_exp10(realvec_t x); + static realvec_t vml_exp2(realvec_t x); + static realvec_t vml_expm1(realvec_t x); + + // log + static realvec_t vml_log(realvec_t x); + static realvec_t vml_log10(realvec_t x); + static realvec_t vml_log1p(realvec_t x); + static realvec_t vml_log2(realvec_t x); + + // pow + static realvec_t vml_pow(realvec_t x, realvec_t y); + + // rcp + static realvec_t vml_fmod(realvec_t x, realvec_t y); + static realvec_t vml_rcp(realvec_t x); + static realvec_t vml_remainder(realvec_t x, realvec_t y); + + // sin + static realvec_t vml_cos(realvec_t x); + static realvec_t vml_sin(realvec_t x); + static realvec_t vml_tan(realvec_t x); + + // sinh + static realvec_t vml_cosh(realvec_t x); + static realvec_t vml_sinh(realvec_t x); + static realvec_t vml_tanh(realvec_t x); + + // sqrt + static realvec_t vml_cbrt(realvec_t x); + static realvec_t vml_hypot(realvec_t x, realvec_t y); + static realvec_t vml_rsqrt(realvec_t x); + static realvec_t vml_sqrt(realvec_t x); +}; -namespace vecmathlib { - - template<typename realvec_t> - struct mathfuncs { - typedef floatprops<typename realvec_t::real_t> FP; - - typedef typename FP::real_t real_t; - typedef typename FP::int_t int_t; - typedef typename FP::uint_t uint_t; - - static int const size = realvec_t::size; - - // typedef realvec<real_t, size> realvec_t; - typedef typename realvec_t::intvec_t intvec_t; - typedef typename realvec_t::boolvec_t boolvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - // static real_t R(double a) { return real_t(a); } - // static int_t I(int a) { return int_t(a); } - // static uint_t U(int a) { return uint_t(a); } - // static realvec_t RV(real_t a) { return realvec_t(a); } - // static intvec_t IV(int_t a) { return intvec_t(a); } - // static boolvec_t BV(bool a) { return boolvec_t(a); } - - // int - static intvec_t vml_abs(intvec_t x); - static intvec_t vml_bitifthen(intvec_t x, intvec_t y, intvec_t z); - static intvec_t vml_clz(intvec_t x); - static boolvec_t vml_isignbit(intvec_t x); - static intvec_t vml_max(intvec_t x, intvec_t y); - static intvec_t vml_min(intvec_t x, intvec_t y); - static intvec_t vml_popcount(intvec_t x); - static intvec_t vml_rotate(intvec_t x, int_t n); - static intvec_t vml_rotate(intvec_t x, intvec_t n); - - // asin - static realvec_t vml_acos(realvec_t x); - static realvec_t vml_asin(realvec_t x); - static realvec_t vml_atan(realvec_t x); - static realvec_t vml_atan2(realvec_t y, realvec_t x); - - // asinh - static realvec_t vml_acosh(realvec_t x); - static realvec_t vml_asinh(realvec_t x); - static realvec_t vml_atanh(realvec_t x); - - // convert - static realvec_t vml_antitrunc(realvec_t x); - static realvec_t vml_ceil(realvec_t x); - static realvec_t vml_convert_float(intvec_t x); - static intvec_t vml_convert_int(realvec_t x); - static realvec_t vml_floor(realvec_t x); - static intvec_t vml_lrint(realvec_t x); - static realvec_t vml_rint(realvec_t x); - static realvec_t vml_round(realvec_t x); - static realvec_t vml_nextafter(realvec_t x, realvec_t y); - static realvec_t vml_trunc(realvec_t x); - - // fabs - static realvec_t vml_copysign(realvec_t x, realvec_t y); - static realvec_t vml_fabs(realvec_t x); - static realvec_t vml_fdim(realvec_t x, realvec_t y); - static realvec_t vml_fma(realvec_t x, realvec_t y, realvec_t z); - static realvec_t vml_fmax(realvec_t x, realvec_t y); - static realvec_t vml_fmin(realvec_t x, realvec_t y); - static realvec_t vml_frexp(realvec_t x, intvec_t* r); - static intvec_t vml_ilogb(realvec_t x); - static boolvec_t vml_ieee_isfinite(realvec_t x); - static boolvec_t vml_ieee_isinf(realvec_t x); - static boolvec_t vml_ieee_isnan(realvec_t x); - static boolvec_t vml_ieee_isnormal(realvec_t x); - static boolvec_t vml_isfinite(realvec_t x); - static boolvec_t vml_isinf(realvec_t x); - static boolvec_t vml_isnan(realvec_t x); - static boolvec_t vml_isnormal(realvec_t x); - static realvec_t vml_ldexp(realvec_t x, intvec_t n); - static realvec_t vml_mad(realvec_t x, realvec_t y, realvec_t z); - static boolvec_t vml_signbit(realvec_t x); - - // exp - static realvec_t vml_exp(realvec_t x); - static realvec_t vml_exp10(realvec_t x); - static realvec_t vml_exp2(realvec_t x); - static realvec_t vml_expm1(realvec_t x); - - // log - static realvec_t vml_log(realvec_t x); - static realvec_t vml_log10(realvec_t x); - static realvec_t vml_log1p(realvec_t x); - static realvec_t vml_log2(realvec_t x); - - // pow - static realvec_t vml_pow(realvec_t x, realvec_t y); - - // rcp - static realvec_t vml_fmod(realvec_t x, realvec_t y); - static realvec_t vml_rcp(realvec_t x); - static realvec_t vml_remainder(realvec_t x, realvec_t y); - - // sin - static realvec_t vml_cos(realvec_t x); - static realvec_t vml_sin(realvec_t x); - static realvec_t vml_tan(realvec_t x); - - // sinh - static realvec_t vml_cosh(realvec_t x); - static realvec_t vml_sinh(realvec_t x); - static realvec_t vml_tanh(realvec_t x); - - // sqrt - static realvec_t vml_cbrt(realvec_t x); - static realvec_t vml_hypot(realvec_t x, realvec_t y); - static realvec_t vml_rsqrt(realvec_t x); - static realvec_t vml_sqrt(realvec_t x); - }; - } // namespace vecmathlib -#endif // #ifndef MATHFUNCS_BASE_H +#endif // #ifndef MATHFUNCS_BASE_H diff --git a/mathfuncs_convert.h b/mathfuncs_convert.h index 79befbc..9cb1add 100644 --- a/mathfuncs_convert.h +++ b/mathfuncs_convert.h @@ -7,197 +7,179 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x) { + // Convert in two passes. Convert as much as possible during the + // first pass (lobits), so that the second pass (hibits) may be + // omitted if the high bits are known to be zero. + int_t lobits = FP::mantissa_bits; + // int_t hibits = FP::bits - lobits; -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_convert_float(intvec_t x) - { - // Convert in two passes. Convert as much as possible during the - // first pass (lobits), so that the second pass (hibits) may be - // omitted if the high bits are known to be zero. - int_t lobits = FP::mantissa_bits; - // int_t hibits = FP::bits - lobits; - - // Convert lower bits - intvec_t xlo = x & IV((U(1) << lobits) - 1); - // exponent for the equivalent floating point number - int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits; - xlo |= exponent_lo; - // subtract hidden mantissa bit - realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo)); - - // Convert upper bits - // make unsigned by subtracting largest negative number - // (only do this for the high bits, since they have sufficient - // precision to handle the overflow) - x ^= FP::signbit_mask; - intvec_t xhi = lsr(x, lobits); - // exponent for the equivalent floating point number - int_t exponent_hi = (FP::exponent_offset + 2*lobits) << FP::mantissa_bits; - xhi |= exponent_hi; - // subtract hidden mantissa bit - realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi)); - // add largest negative number again - fhi -= RV(R(FP::signbit_mask)); - // Ensure that the converted low and high bits are calculated - // separately, since a real_t doesn't have enough precision to - // hold all the bits of an int_t - fhi.barrier(); - - // Combine results - return flo + fhi; - } - - - - template<typename realvec_t> - typename realvec_t::intvec_t - mathfuncs<realvec_t>::vml_convert_int(realvec_t x) - { - // Handle overflow - // int_t min_int = FP::signbit_mask; - // int_t max_int = ~FP::signbit_mask; - // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int)); - // Handle negative numbers - boolvec_t is_negative = signbit(x); - x = fabs(x); - // Handle small numbers - boolvec_t issmall = x < RV(1.0); - - intvec_t shift = ilogb(x) - IV(FP::mantissa_bits); - boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits)); - intvec_t ix = as_int(x) & IV(FP::mantissa_mask); - // add hidden mantissa bit - ix |= U(1) << FP::mantissa_bits; - // shift according to exponent (which may truncate) - ix = ifthen(shift_left, ix << shift, ix >> -shift); - - // Handle small numbers - ix = ifthen(issmall, IV(I(0)), ix); - // Handle negative numbers - ix = ifthen(is_negative, -ix, ix); - // Handle overflow - // ix = ifthen(is_overflow, IV(min_int), ix); - - return ix; - } - - - - // Round to nearest integer, breaking ties using prevailing rounding - // mode (default: round to even) - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x) - { - realvec_t r = x; - // Round by adding a large number, destroying all excess precision - realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x); - r += offset; - // Ensure the rounding is not optimised away - r.barrier(); - r -= offset; - return r; - } - - // Round to next integer above - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x) - { - // boolvec_t iszero = x == RV(0.0); - // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits)); - // return ifthen(iszero, x, rint(x + offset)); - return ifthen(x<RV(0.0), trunc(x), vml_antitrunc(x)); - } - - // Round to next integer below - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x) - { - // boolvec_t iszero = x == RV(0.0); - // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits)); - // return ifthen(iszero, x, rint(x - offset)); - return ifthen(x<RV(0.0), vml_antitrunc(x), trunc(x)); - } - - // Round to nearest integer, breaking ties using prevailing rounding - // mode (default: round to even), returning an integer - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x) - { - return convert_int(rint(x)); - } - - // Round to nearest integer, breaking ties away from zero - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x) - { - // return copysign(floor(fabs(x)+RV(0.5)), x); - return trunc(x + copysign(RV(0.5), x)); - } - - // Round to next integer towards zero - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x) - { - realvec_t x0 = x; - x = fabs(x); - boolvec_t istoosmall = x < RV(1.0); - boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits)); - // Number of mantissa bits to keep - intvec_t nbits = ilogb(x); - // This is probably faster than a shift operation - realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0); - intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask); - realvec_t y = as_float(as_int(x) & imask); - realvec_t r = + // Convert lower bits + intvec_t xlo = x & IV((U(1) << lobits) - 1); + // exponent for the equivalent floating point number + int_t exponent_lo = (FP::exponent_offset + lobits) << FP::mantissa_bits; + xlo |= exponent_lo; + // subtract hidden mantissa bit + realvec_t flo = as_float(xlo) - RV(FP::as_float(exponent_lo)); + + // Convert upper bits + // make unsigned by subtracting largest negative number + // (only do this for the high bits, since they have sufficient + // precision to handle the overflow) + x ^= FP::signbit_mask; + intvec_t xhi = lsr(x, lobits); + // exponent for the equivalent floating point number + int_t exponent_hi = (FP::exponent_offset + 2 * lobits) << FP::mantissa_bits; + xhi |= exponent_hi; + // subtract hidden mantissa bit + realvec_t fhi = as_float(xhi) - RV(FP::as_float(exponent_hi)); + // add largest negative number again + fhi -= RV(R(FP::signbit_mask)); + // Ensure that the converted low and high bits are calculated + // separately, since a real_t doesn't have enough precision to + // hold all the bits of an int_t + fhi.barrier(); + + // Combine results + return flo + fhi; +} + +template <typename realvec_t> +typename realvec_t::intvec_t +mathfuncs<realvec_t>::vml_convert_int(realvec_t x) { + // Handle overflow + // int_t min_int = FP::signbit_mask; + // int_t max_int = ~FP::signbit_mask; + // boolvec_t is_overflow = x < RV(R(min_int)) || x > RV(R(max_int)); + // Handle negative numbers + boolvec_t is_negative = signbit(x); + x = fabs(x); + // Handle small numbers + boolvec_t issmall = x < RV(1.0); + + intvec_t shift = ilogb(x) - IV(FP::mantissa_bits); + boolvec_t shift_left = x > RV(std::ldexp(R(1.0), FP::mantissa_bits)); + intvec_t ix = as_int(x) & IV(FP::mantissa_mask); + // add hidden mantissa bit + ix |= U(1) << FP::mantissa_bits; + // shift according to exponent (which may truncate) + ix = ifthen(shift_left, ix << shift, ix >> -shift); + + // Handle small numbers + ix = ifthen(issmall, IV(I(0)), ix); + // Handle negative numbers + ix = ifthen(is_negative, -ix, ix); + // Handle overflow + // ix = ifthen(is_overflow, IV(min_int), ix); + + return ix; +} + +// Round to nearest integer, breaking ties using prevailing rounding +// mode (default: round to even) +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_rint(realvec_t x) { + realvec_t r = x; + // Round by adding a large number, destroying all excess precision + realvec_t offset = copysign(RV(std::ldexp(R(1.0), FP::mantissa_bits)), x); + r += offset; + // Ensure the rounding is not optimised away + r.barrier(); + r -= offset; + return r; +} + +// Round to next integer above +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_ceil(realvec_t x) { + // boolvec_t iszero = x == RV(0.0); + // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits)); + // return ifthen(iszero, x, rint(x + offset)); + return ifthen(x < RV(0.0), trunc(x), vml_antitrunc(x)); +} + +// Round to next integer below +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_floor(realvec_t x) { + // boolvec_t iszero = x == RV(0.0); + // realvec_t offset = RV(0.5) - ldexp(fabs(x), I(-FP::mantissa_bits)); + // return ifthen(iszero, x, rint(x - offset)); + return ifthen(x < RV(0.0), vml_antitrunc(x), trunc(x)); +} + +// Round to nearest integer, breaking ties using prevailing rounding +// mode (default: round to even), returning an integer +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_lrint(realvec_t x) { + return convert_int(rint(x)); +} + +// Round to nearest integer, breaking ties away from zero +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_round(realvec_t x) { + // return copysign(floor(fabs(x)+RV(0.5)), x); + return trunc(x + copysign(RV(0.5), x)); +} + +// Round to next integer towards zero +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_trunc(realvec_t x) { + realvec_t x0 = x; + x = fabs(x); + boolvec_t istoosmall = x < RV(1.0); + boolvec_t istoolarge = x >= RV(std::ldexp(R(1.0), FP::mantissa_bits)); + // Number of mantissa bits to keep + intvec_t nbits = ilogb(x); + // This is probably faster than a shift operation + realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0); + intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask); + realvec_t y = as_float(as_int(x) & imask); + realvec_t r = copysign(ifthen(istoosmall, RV(0.0), ifthen(istoolarge, x, y)), x0); - return r; - } - - // Round to next integer away from zero - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x) - { - realvec_t x0 = x; - x = fabs(x); - boolvec_t iszero = x == RV(0.0); - boolvec_t issmall = x <= RV(1.0); - boolvec_t istoolarge = - x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0)); - // Number of mantissa bits to keep - intvec_t nbits = ilogb(x); - // This is probably faster than a shift operation - realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0); - intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask); - realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits)); - offset.barrier(); - realvec_t y = as_float(as_int(x + offset) & imask); - realvec_t r = + return r; +} + +// Round to next integer away from zero +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_antitrunc(realvec_t x) { + realvec_t x0 = x; + x = fabs(x); + boolvec_t iszero = x == RV(0.0); + boolvec_t issmall = x <= RV(1.0); + boolvec_t istoolarge = x > RV(std::ldexp(R(1.0), FP::mantissa_bits) - R(1.0)); + // Number of mantissa bits to keep + intvec_t nbits = ilogb(x); + // This is probably faster than a shift operation + realvec_t mask = ldexp(RV(2.0), nbits) - RV(1.0); + intvec_t imask = IV(FP::signbit_mask | FP::exponent_mask) | as_int(mask); + realvec_t offset = RV(1.0) - ldexp(RV(1.0), nbits - IV(FP::mantissa_bits)); + offset.barrier(); + realvec_t y = as_float(as_int(x + offset) & imask); + realvec_t r = copysign(ifthen(iszero, RV(0.0), - ifthen(issmall, RV(1.0), - ifthen(istoolarge, x, y))), x0); - return r; - } - - // Next machine representable number from x in direction y - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y) - { - realvec_t dir = y - x; - realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x)); - offset = copysign(offset, dir); - offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) || - signbit(x) == signbit(offset), - offset, - offset * RV(0.5)); - realvec_t r = x + offset; - real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits); - return ifthen(dir==RV(0.0), y, - ifthen(x==RV(0.0), copysign(RV(smallest_pos), dir), r)); - } - + ifthen(issmall, RV(1.0), ifthen(istoolarge, x, y))), + x0); + return r; +} + +// Next machine representable number from x in direction y +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_nextafter(realvec_t x, realvec_t y) { + realvec_t dir = y - x; + realvec_t offset = ldexp(RV(FP::epsilon()), ilogb(x)); + offset = copysign(offset, dir); + offset = ifthen(convert_bool(as_int(x) & IV(FP::mantissa_mask)) || + signbit(x) == signbit(offset), + offset, offset * RV(0.5)); + realvec_t r = x + offset; + real_t smallest_pos = std::ldexp(FP::min(), -FP::mantissa_bits); + return ifthen(dir == RV(0.0), y, + ifthen(x == RV(0.0), copysign(RV(smallest_pos), dir), r)); +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_CONVERT_H +#endif // #ifndef MATHFUNCS_CONVERT_H diff --git a/mathfuncs_exp.h b/mathfuncs_exp.h index d357a21..e35fb1b 100644 --- a/mathfuncs_exp.h +++ b/mathfuncs_exp.h @@ -7,156 +7,145 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x) { + // TODO: Check SLEEF 2.80 algorithm + // (in particular the improved-precision truncation) + + // Rescale + realvec_t x0 = x; + +// realvec_t round_x = rint(x); +// intvec_t iround_x = convert_int(round_x); +// r = ldexp(r, iround_x); -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_exp2(realvec_t x) - { - // TODO: Check SLEEF 2.80 algorithm - // (in particular the improved-precision truncation) - - // Rescale - realvec_t x0 = x; - - // realvec_t round_x = rint(x); - // intvec_t iround_x = convert_int(round_x); - // r = ldexp(r, iround_x); - #if 0 // Straightforward implementation realvec_t round_x = rint(x); x -= round_x; #elif 1 - // Round by adding, then subtracting again a large number - // Add a large number to move the mantissa bits to the right - int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset; - realvec_t tmp = x + RV(R(large)); - tmp.barrier(); - - realvec_t round_x = tmp - RV(R(large)); - x -= round_x; + // Round by adding, then subtracting again a large number + // Add a large number to move the mantissa bits to the right + int_t large = (U(1) << FP::mantissa_bits) + FP::exponent_offset; + realvec_t tmp = x + RV(R(large)); + tmp.barrier(); + + realvec_t round_x = tmp - RV(R(large)); + x -= round_x; #else - // Straightforward implementation, using round instead of rint, - // since round is faster for QPX - realvec_t round_x = round(x); - x -= round_x; + // Straightforward implementation, using round instead of rint, + // since round is faster for QPX + realvec_t round_x = round(x); + x -= round_x; #endif - VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5))); - - // Polynomial expansion - realvec_t r; - switch (sizeof(real_t)) { - case 4: + VML_ASSERT(all(x >= RV(-0.5) && x <= RV(0.5))); + + // Polynomial expansion + realvec_t r; + switch (sizeof(real_t)) { + case 4: #ifdef VML_HAVE_FP_CONTRACT - // float, error=4.55549108005200277750378992345e-9 - r = RV(0.000154653240842602623787395880898); - r = mad(r, x, RV(0.00133952915439234389712105060319)); - r = mad(r, x, RV(0.0096180399118156827664944870552)); - r = mad(r, x, RV(0.055503406540531310853149866446)); - r = mad(r, x, RV(0.240226511015459465468737123346)); - r = mad(r, x, RV(0.69314720007380208630542805293)); - r = mad(r, x, RV(0.99999999997182023878745628977)); + // float, error=4.55549108005200277750378992345e-9 + r = RV(0.000154653240842602623787395880898); + r = mad(r, x, RV(0.00133952915439234389712105060319)); + r = mad(r, x, RV(0.0096180399118156827664944870552)); + r = mad(r, x, RV(0.055503406540531310853149866446)); + r = mad(r, x, RV(0.240226511015459465468737123346)); + r = mad(r, x, RV(0.69314720007380208630542805293)); + r = mad(r, x, RV(0.99999999997182023878745628977)); #else - // float, error=1.62772721960621336664735896836e-7 - r = RV(0.00133952915439234389712105060319); - r = mad(r, x, RV(0.009670773148229417605024318985)); - r = mad(r, x, RV(0.055503406540531310853149866446)); - r = mad(r, x, RV(0.240222115700585316818177639177)); - r = mad(r, x, RV(0.69314720007380208630542805293)); - r = mad(r, x, RV(1.00000005230745711373079206024)); + // float, error=1.62772721960621336664735896836e-7 + r = RV(0.00133952915439234389712105060319); + r = mad(r, x, RV(0.009670773148229417605024318985)); + r = mad(r, x, RV(0.055503406540531310853149866446)); + r = mad(r, x, RV(0.240222115700585316818177639177)); + r = mad(r, x, RV(0.69314720007380208630542805293)); + r = mad(r, x, RV(1.00000005230745711373079206024)); #endif - break; - case 8: + break; + case 8: #ifdef VML_HAVE_FP_CONTRACT - // double, error=9.32016781355638010975628074746e-18 - r = RV(4.45623165388261696886670014471e-10); - r = mad(r, x, RV(7.0733589360775271430968224806e-9)); - r = mad(r, x, RV(1.01780540270960163558119510246e-7)); - r = mad(r, x, RV(1.3215437348041505269462510712e-6)); - r = mad(r, x, RV(0.000015252733849766201174247690629)); - r = mad(r, x, RV(0.000154035304541242555115696403795)); - r = mad(r, x, RV(0.00133335581463968601407096905671)); - r = mad(r, x, RV(0.0096181291075949686712855561931)); - r = mad(r, x, RV(0.055504108664821672870565883052)); - r = mad(r, x, RV(0.240226506959101382690753994082)); - r = mad(r, x, RV(0.69314718055994530864272481773)); - r = mad(r, x, RV(0.9999999999999999978508676375)); + // double, error=9.32016781355638010975628074746e-18 + r = RV(4.45623165388261696886670014471e-10); + r = mad(r, x, RV(7.0733589360775271430968224806e-9)); + r = mad(r, x, RV(1.01780540270960163558119510246e-7)); + r = mad(r, x, RV(1.3215437348041505269462510712e-6)); + r = mad(r, x, RV(0.000015252733849766201174247690629)); + r = mad(r, x, RV(0.000154035304541242555115696403795)); + r = mad(r, x, RV(0.00133335581463968601407096905671)); + r = mad(r, x, RV(0.0096181291075949686712855561931)); + r = mad(r, x, RV(0.055504108664821672870565883052)); + r = mad(r, x, RV(0.240226506959101382690753994082)); + r = mad(r, x, RV(0.69314718055994530864272481773)); + r = mad(r, x, RV(0.9999999999999999978508676375)); #else - // double, error=3.74939899823302048807873981077e-14 - r = RV(1.02072375599725694063203809188e-7); - r = mad(r, x, RV(1.32573274434801314145133004073e-6)); - r = mad(r, x, RV(0.0000152526647170731944840736190013)); - r = mad(r, x, RV(0.000154034441925859828261898614555)); - r = mad(r, x, RV(0.00133335582175770747495287552557)); - r = mad(r, x, RV(0.0096181291794939392517233403183)); - r = mad(r, x, RV(0.055504108664525029438908798685)); - r = mad(r, x, RV(0.240226506957026959772247598695)); - r = mad(r, x, RV(0.6931471805599487321347668143)); - r = mad(r, x, RV(1.00000000000000942892870993489)); + // double, error=3.74939899823302048807873981077e-14 + r = RV(1.02072375599725694063203809188e-7); + r = mad(r, x, RV(1.32573274434801314145133004073e-6)); + r = mad(r, x, RV(0.0000152526647170731944840736190013)); + r = mad(r, x, RV(0.000154034441925859828261898614555)); + r = mad(r, x, RV(0.00133335582175770747495287552557)); + r = mad(r, x, RV(0.0096181291794939392517233403183)); + r = mad(r, x, RV(0.055504108664525029438908798685)); + r = mad(r, x, RV(0.240226506957026959772247598695)); + r = mad(r, x, RV(0.6931471805599487321347668143)); + r = mad(r, x, RV(1.00000000000000942892870993489)); #endif - break; - default: - __builtin_unreachable(); - } - - // Undo rescaling + break; + default: + __builtin_unreachable(); + } + +// Undo rescaling #if 0 // Straightforward implementation r = ldexp(r, convert_int(round_x)); #elif 1 - // Use direct integer manipulation - // Extract integer as lowest mantissa bits (highest bits still - // contain offset, exponent, and sign) - intvec_t itmp = as_int(tmp); - // Construct scale factor by setting exponent (this shifts out the - // highest bits) - realvec_t scale = as_float(itmp << I(FP::mantissa_bits)); - r *= scale; + // Use direct integer manipulation + // Extract integer as lowest mantissa bits (highest bits still + // contain offset, exponent, and sign) + intvec_t itmp = as_int(tmp); + // Construct scale factor by setting exponent (this shifts out the + // highest bits) + realvec_t scale = as_float(itmp << I(FP::mantissa_bits)); + r *= scale; #else - // Use floating point operations instead of integer operations, - // since these are faster for QPX - real_t exponent_factor = R(I(1) << I(FP::mantissa_bits)); - real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits)); - realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset)); - realvec_t scale = as_float(convert_int(exponent)); - r *= scale; + // Use floating point operations instead of integer operations, + // since these are faster for QPX + real_t exponent_factor = R(I(1) << I(FP::mantissa_bits)); + real_t exponent_offset = R(I(FP::exponent_offset) << I(FP::mantissa_bits)); + realvec_t exponent = mad(round_x, RV(exponent_factor), RV(exponent_offset)); + realvec_t scale = as_float(convert_int(exponent)); + r *= scale; #endif - - r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r); - - return r; - } - - - - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x) - { - return exp2(RV(M_LOG2E) * x); - } - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x) - { - return exp2(RV(M_LOG2E * M_LN10) * x); - } + r = ifthen(x0 < RV(R(FP::min_exponent)), RV(0.0), r); + + return r; +} - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x) - { - // TODO: improve this - return exp(x) - RV(1.0); +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_exp(realvec_t x) { + return exp2(RV(M_LOG2E) * x); +} + +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_exp10(realvec_t x) { + return exp2(RV(M_LOG2E * M_LN10) * x); +} + +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_expm1(realvec_t x) { + // TODO: improve this + return exp(x) - RV(1.0); #if 0 r = exp(x) - RV(1.0); return ifthen(r == RV(0.0), x, r); #endif - } - +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_EXP_H +#endif // #ifndef MATHFUNCS_EXP_H diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h index 4f31dec..c3f7356 100644 --- a/mathfuncs_fabs.h +++ b/mathfuncs_fabs.h @@ -7,201 +7,176 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) { + intvec_t value = as_int(x) & IV(U(~FP::signbit_mask)); + intvec_t sign = as_int(y) & IV(FP::signbit_mask); + return as_float(sign | value); +} -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) - { - intvec_t value = as_int(x) & IV(U(~FP::signbit_mask)); - intvec_t sign = as_int(y) & IV(FP::signbit_mask); - return as_float(sign | value); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) - { - return as_float(as_int(x) & IV(U(~FP::signbit_mask))); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) - { - // return ifthen(x > y, x - y, RV(0.0)); - return fmax(x - y, RV(0.0)); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) - { - return x * y + z; - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) - { - return ifthen(x < y, y, x); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) - { - return ifthen(y < x, y, x); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x, - typename realvec_t::intvec_t* irp) - { - intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); - intvec_t ir = e - IV(FP::exponent_offset - 1); - ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min())); +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) { + return as_float(as_int(x) & IV(U(~FP::signbit_mask))); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) { + // return ifthen(x > y, x - y, RV(0.0)); + return fmax(x - y, RV(0.0)); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) { + return x * y + z; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) { + return ifthen(x < y, y, x); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) { + return ifthen(y < x, y, x); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_frexp(realvec_t x, + typename realvec_t::intvec_t *irp) { + intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); + intvec_t ir = e - IV(FP::exponent_offset - 1); + ir = ifthen(convert_bool(e), ir, IV(std::numeric_limits<int_t>::min())); #if defined VML_HAVE_INF - ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir); + ir = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), ir); #endif #if defined VML_HAVE_NAN - ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir); + ir = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), ir); #endif - realvec_t r = + realvec_t r = as_float((as_int(x) & IV(FP::signbit_mask | FP::mantissa_mask)) | IV(FP::as_int(R(0.5)) & FP::exponent_mask)); - boolvec_t iszero = x == RV(0.0); - ir = ifthen(iszero, IV(I(0)), ir); - r = ifthen(iszero, copysign(RV(R(0.0)), r), r); - *irp = ir; - return r; - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x) - { - // TODO: Check SLEEF 2.80 algorithm - intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); - intvec_t r = e - IV(FP::exponent_offset); - r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min())); + boolvec_t iszero = x == RV(0.0); + ir = ifthen(iszero, IV(I(0)), ir); + r = ifthen(iszero, copysign(RV(R(0.0)), r), r); + *irp = ir; + return r; +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_ilogb(realvec_t x) { + // TODO: Check SLEEF 2.80 algorithm + intvec_t e = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); + intvec_t r = e - IV(FP::exponent_offset); + r = ifthen(convert_bool(e), r, IV(std::numeric_limits<int_t>::min())); #if defined VML_HAVE_INF - r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r); + r = ifthen(isinf(x), IV(std::numeric_limits<int_t>::max()), r); #endif #if defined VML_HAVE_NAN - r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r); + r = ifthen(isnan(x), IV(std::numeric_limits<int_t>::min()), r); #endif - return r; - } - - template<typename realvec_t> - typename realvec_t::boolvec_t - mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x) - { - return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask); - } - - template<typename realvec_t> - typename realvec_t::boolvec_t - mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x) - { - return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask); - } - - template<typename realvec_t> - typename realvec_t::boolvec_t - mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x) - { - return - (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) && - (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0)); - } - - template<typename realvec_t> - typename realvec_t::boolvec_t - mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x) - { - return - (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) && - (as_int(x) & IV(FP::exponent_mask)) != IV(I(0)); - } - - template<typename realvec_t> - typename realvec_t::boolvec_t - mathfuncs<realvec_t>::vml_isfinite(realvec_t x) - { + return r; +} + +template <typename realvec_t> +typename realvec_t::boolvec_t +mathfuncs<realvec_t>::vml_ieee_isfinite(realvec_t x) { + return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask); +} + +template <typename realvec_t> +typename realvec_t::boolvec_t +mathfuncs<realvec_t>::vml_ieee_isinf(realvec_t x) { + return (as_int(x) & IV(I(~FP::signbit_mask))) == IV(FP::exponent_mask); +} + +template <typename realvec_t> +typename realvec_t::boolvec_t +mathfuncs<realvec_t>::vml_ieee_isnan(realvec_t x) { + return (as_int(x) & IV(FP::exponent_mask)) == IV(FP::exponent_mask) && + (as_int(x) & IV(FP::mantissa_mask)) != IV(I(0)); +} + +template <typename realvec_t> +typename realvec_t::boolvec_t +mathfuncs<realvec_t>::vml_ieee_isnormal(realvec_t x) { + return (as_int(x) & IV(FP::exponent_mask)) != IV(FP::exponent_mask) && + (as_int(x) & IV(FP::exponent_mask)) != IV(I(0)); +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isfinite(realvec_t x) { #if defined VML_HAVE_INF || defined VML_HAVE_NAN - return vml_ieee_isfinite(x); + return vml_ieee_isfinite(x); #else - return BV(true); + return BV(true); #endif - } - - template<typename realvec_t> - typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x) - { +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isinf(realvec_t x) { #if defined VML_HAVE_INF - return vml_ieee_isinf(x); + return vml_ieee_isinf(x); #else - return BV(false); + return BV(false); #endif - } - - template<typename realvec_t> - typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x) - { +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnan(realvec_t x) { #if defined VML_HAVE_NAN - return vml_ieee_isnan(x); + return vml_ieee_isnan(x); #else - return BV(false); + return BV(false); #endif - } - - template<typename realvec_t> - typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x) - { +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isnormal(realvec_t x) { #if defined VML_HAVE_DENORMALS || defined VML_HAVE_INF || defined VML_HAVE_NAN - return vml_ieee_isnormal(x); + return vml_ieee_isnormal(x); #else - return BV(true); + return BV(true); #endif - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n) - { - // TODO: Check SLEEF 2.80 algorithm +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_ldexp(realvec_t x, intvec_t n) { +// TODO: Check SLEEF 2.80 algorithm #if 0 realvec_t r = as_float(as_int(x) + (n << I(FP::mantissa_bits))); r = ifthen((as_int(x) & IV(FP::exponent_mask)) == IV(I(0)), x, r); return r; #endif - realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits))); - int max_n = FP::max_exponent - FP::min_exponent; - boolvec_t underflow = n < IV(I(-max_n)); - boolvec_t overflow = n > IV(I(max_n)); - intvec_t old_exp = - lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); - intvec_t new_exp = old_exp + n; - // TODO: check bit patterns instead - underflow = + realvec_t r = as_float(as_int(x) + (n << U(FP::mantissa_bits))); + int max_n = FP::max_exponent - FP::min_exponent; + boolvec_t underflow = n < IV(I(-max_n)); + boolvec_t overflow = n > IV(I(max_n)); + intvec_t old_exp = lsr(as_int(x) & IV(FP::exponent_mask), FP::mantissa_bits); + intvec_t new_exp = old_exp + n; + // TODO: check bit patterns instead + underflow = underflow || new_exp < IV(I(FP::min_exponent + FP::exponent_offset)); - overflow = + overflow = overflow || new_exp > IV(I(FP::max_exponent + FP::exponent_offset)); - r = ifthen(underflow, copysign(RV(R(0.0)), x), r); - r = ifthen(overflow, copysign(RV(FP::infinity()), x), r); - boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x); - r = ifthen(dont_change, x, r); - return r; - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z) - { - return x * y + z; - } - - template<typename realvec_t> - typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x) - { - return convert_bool(as_int(x) & IV(FP::signbit_mask)); - } - + r = ifthen(underflow, copysign(RV(R(0.0)), x), r); + r = ifthen(overflow, copysign(RV(FP::infinity()), x), r); + boolvec_t dont_change = x == RV(R(0.0)) || isinf(x) || isnan(x); + r = ifthen(dont_change, x, r); + return r; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_mad(realvec_t x, realvec_t y, realvec_t z) { + return x * y + z; +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_signbit(realvec_t x) { + return convert_bool(as_int(x) & IV(FP::signbit_mask)); +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_FABS_H +#endif // #ifndef MATHFUNCS_FABS_H diff --git a/mathfuncs_int.h b/mathfuncs_int.h index 862189d..fff65ff 100644 --- a/mathfuncs_int.h +++ b/mathfuncs_int.h @@ -7,129 +7,128 @@ #include <climits> +namespace vecmathlib { +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x) { + return ifthen(isignbit(x), -x, x); +} + +template <typename realvec_t> +typename realvec_t::intvec_t +mathfuncs<realvec_t>::vml_bitifthen(intvec_t x, intvec_t y, intvec_t z) { + return (x & y) | (~x & z); +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x) { + // These implementations return 8*sizeof(TYPE) when the input is 0 + + // These explicit implementations are taken from + // <http://aggregate.org/MAGIC/>: + // + // @techreport{magicalgorithms, + // author={Henry Gordon Dietz}, + // title={{The Aggregate Magic Algorithms}}, + // institution={University of Kentucky}, + // howpublished={Aggregate.Org online technical report}, + // date={2013-03-25}, + // URL={http://aggregate.org/MAGIC/} + // } + + int_t bits = CHAR_BIT * sizeof(int_t); + if (bits > 1) + x |= lsr(x, 1); + if (bits > 2) + x |= lsr(x, 2); + if (bits > 4) + x |= lsr(x, 4); + if (bits > 8) + x |= lsr(x, 8); + if (bits > 16) + x |= lsr(x, 16); + if (bits > 32) + x |= lsr(x, 32); + if (bits > 64) + x |= lsr(x, 64); + assert(bits <= 128); + return IV(I(bits)) - popcount(x); +} + +template <typename realvec_t> +typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x) { + return x < IV(I(0)); +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x, + intvec_t y) { + return ifthen(x >= y, x, y); +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x, + intvec_t y) { + return ifthen(x < y, x, y); +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x) { + // These explicit implementations are taken from + // <http://aggregate.org/MAGIC/>: + // + // @techreport{magicalgorithms, + // author={Henry Gordon Dietz}, + // title={{The Aggregate Magic Algorithms}}, + // institution={University of Kentucky}, + // howpublished={Aggregate.Org online technical report}, + // date={2013-03-25}, + // URL={http://aggregate.org/MAGIC/} + // } + + int_t bits = CHAR_BIT * sizeof(int_t); + + // intvec_t x55 = IV(FP::replicate_byte(0x55)); + // intvec_t x33 = IV(FP::replicate_byte(0x33)); + // intvec_t x0f = IV(FP::replicate_byte(0x0f)); + intvec_t x55 = I(~U(0) / U(3)); // 0x0101... + intvec_t x33 = I(~U(0) / U(5)); // 0x00110011... + intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111... + + x -= lsr(x, I(1)) & x55; + x = (x & x33) + (lsr(x, I(2)) & x33); + x += lsr(x, I(4)); + x &= x0f; + if (bits > 8) + x += lsr(x, I(8)); + if (bits > 16) + x += lsr(x, I(16)); + if (bits > 32) + x += lsr(x, I(32)); + if (bits > 64) + x += lsr(x, I(64)); + assert(bits <= 128); + return x & IV(I(0xff)); +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x, + int_t n) { + int_t mask = CHAR_BIT * sizeof(int_t) - 1; + intvec_t left = x << (n & mask); + intvec_t right = lsr(x, -n & mask); + return left | right; +} + +template <typename realvec_t> +typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x, + intvec_t n) { + intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1)); + intvec_t left = x << (n & mask); + intvec_t right = lsr(x, -n & mask); + return left | right; +} -namespace vecmathlib { - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_abs(intvec_t x) - { - return ifthen(isignbit(x), -x, x); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_bitifthen(intvec_t x, - intvec_t y, - intvec_t z) - { - return (x & y) | (~x & z); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_clz(intvec_t x) - { - // These implementations return 8*sizeof(TYPE) when the input is 0 - - // These explicit implementations are taken from - // <http://aggregate.org/MAGIC/>: - // - // @techreport{magicalgorithms, - // author={Henry Gordon Dietz}, - // title={{The Aggregate Magic Algorithms}}, - // institution={University of Kentucky}, - // howpublished={Aggregate.Org online technical report}, - // date={2013-03-25}, - // URL={http://aggregate.org/MAGIC/} - // } - - int_t bits = CHAR_BIT * sizeof(int_t); - if (bits > 1) x |= lsr(x, 1); - if (bits > 2) x |= lsr(x, 2); - if (bits > 4) x |= lsr(x, 4); - if (bits > 8) x |= lsr(x, 8); - if (bits > 16) x |= lsr(x, 16); - if (bits > 32) x |= lsr(x, 32); - if (bits > 64) x |= lsr(x, 64); - assert(bits<=128); - return IV(I(bits)) - popcount(x); - } - - template<typename realvec_t> - typename realvec_t::boolvec_t mathfuncs<realvec_t>::vml_isignbit(intvec_t x) - { - return x < IV(I(0)); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_max(intvec_t x, - intvec_t y) - { - return ifthen(x>=y, x, y); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_min(intvec_t x, - intvec_t y) - { - return ifthen(x<y, x, y); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_popcount(intvec_t x) - { - // These explicit implementations are taken from - // <http://aggregate.org/MAGIC/>: - // - // @techreport{magicalgorithms, - // author={Henry Gordon Dietz}, - // title={{The Aggregate Magic Algorithms}}, - // institution={University of Kentucky}, - // howpublished={Aggregate.Org online technical report}, - // date={2013-03-25}, - // URL={http://aggregate.org/MAGIC/} - // } - - int_t bits = CHAR_BIT * sizeof(int_t); - - // intvec_t x55 = IV(FP::replicate_byte(0x55)); - // intvec_t x33 = IV(FP::replicate_byte(0x33)); - // intvec_t x0f = IV(FP::replicate_byte(0x0f)); - intvec_t x55 = I(~U(0) / U(3)); // 0x0101... - intvec_t x33 = I(~U(0) / U(5)); // 0x00110011... - intvec_t x0f = I(~U(0) / U(17)); // 0b0000111100001111... - - x -= lsr(x, I(1)) & x55; - x = (x & x33) + (lsr(x, I(2)) & x33); - x += lsr(x, I(4)); - x &= x0f; - if (bits > 8) x += lsr(x, I(8)); - if (bits > 16) x += lsr(x, I(16)); - if (bits > 32) x += lsr(x, I(32)); - if (bits > 64) x += lsr(x, I(64)); - assert(bits<=128); - return x & IV(I(0xff)); - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x, - int_t n) - { - int_t mask = CHAR_BIT * sizeof(int_t) - 1; - intvec_t left = x << (n & mask); - intvec_t right = lsr(x, -n & mask); - return left | right; - } - - template<typename realvec_t> - typename realvec_t::intvec_t mathfuncs<realvec_t>::vml_rotate(intvec_t x, - intvec_t n) - { - intvec_t mask = IV(I(CHAR_BIT * sizeof(int_t) - 1)); - intvec_t left = x << (n & mask); - intvec_t right = lsr(x, -n & mask); - return left | right; - } - }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_ASIN_H +#endif // #ifndef MATHFUNCS_ASIN_H diff --git a/mathfuncs_log.h b/mathfuncs_log.h index cd71eb3..fa517ba 100644 --- a/mathfuncs_log.h +++ b/mathfuncs_log.h @@ -7,93 +7,82 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x) { + // Algorithm inspired by SLEEF 2.80 -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_log2(realvec_t x) - { - // Algorithm inspired by SLEEF 2.80 - - // Rescale - intvec_t ilogb_x = ilogb(x * RV(M_SQRT2)); - x = ldexp(x, -ilogb_x); - VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2))); - - realvec_t y = (x - RV(1.0)) / (x + RV(1.0)); - realvec_t y2 = y*y; - - realvec_t r; - switch (sizeof(real_t)) { - case 4: - // float, error=7.09807175879142775648452461821e-8 - r = RV(0.59723611417135718739797302426); - r = mad(r, y2, RV(0.961524413175528426101613434)); - r = mad(r, y2, RV(2.88539097665498228703236701)); - break; - case 8: + // Rescale + intvec_t ilogb_x = ilogb(x * RV(M_SQRT2)); + x = ldexp(x, -ilogb_x); + VML_ASSERT(all(x >= RV(M_SQRT1_2) && x <= RV(M_SQRT2))); + + realvec_t y = (x - RV(1.0)) / (x + RV(1.0)); + realvec_t y2 = y * y; + + realvec_t r; + switch (sizeof(real_t)) { + case 4: + // float, error=7.09807175879142775648452461821e-8 + r = RV(0.59723611417135718739797302426); + r = mad(r, y2, RV(0.961524413175528426101613434)); + r = mad(r, y2, RV(2.88539097665498228703236701)); + break; + case 8: #ifdef VML_HAVE_FP_CONTRACT - // double, error=1.48294180185938512675770096324e-16 - r = RV(0.243683403415639178527756320773); - r = mad(r, y2, RV(0.26136626803870009948502658)); - r = mad(r, y2, RV(0.320619429891299265439389)); - r = mad(r, y2, RV(0.4121983452028499242926)); - r = mad(r, y2, RV(0.577078017761894161436)); - r = mad(r, y2, RV(0.96179669392233355927)); - r = mad(r, y2, RV(2.8853900817779295236)); + // double, error=1.48294180185938512675770096324e-16 + r = RV(0.243683403415639178527756320773); + r = mad(r, y2, RV(0.26136626803870009948502658)); + r = mad(r, y2, RV(0.320619429891299265439389)); + r = mad(r, y2, RV(0.4121983452028499242926)); + r = mad(r, y2, RV(0.577078017761894161436)); + r = mad(r, y2, RV(0.96179669392233355927)); + r = mad(r, y2, RV(2.8853900817779295236)); #else - // double, error=2.1410114030383689267772704676e-14 - r = RV(0.283751646449323373643963474845); - r = mad(r, y2, RV(0.31983138095551191299118812)); - r = mad(r, y2, RV(0.412211603844146279666022)); - r = mad(r, y2, RV(0.5770779098948940070516)); - r = mad(r, y2, RV(0.961796694295973716912)); - r = mad(r, y2, RV(2.885390081777562819196)); + // double, error=2.1410114030383689267772704676e-14 + r = RV(0.283751646449323373643963474845); + r = mad(r, y2, RV(0.31983138095551191299118812)); + r = mad(r, y2, RV(0.412211603844146279666022)); + r = mad(r, y2, RV(0.5770779098948940070516)); + r = mad(r, y2, RV(0.961796694295973716912)); + r = mad(r, y2, RV(2.885390081777562819196)); #endif - break; - default: - __builtin_unreachable(); - } - r *= y; - - // Undo rescaling - r += convert_float(ilogb_x); - - return r; - } - - - - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x) - { - return log2(x) * RV(M_LN2); + break; + default: + __builtin_unreachable(); } + r *= y; - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x) - { - return log(x) * RV(M_LOG10E); - } + // Undo rescaling + r += convert_float(ilogb_x); - template<typename realvec_t> - inline - realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x) - { - // TODO: Check SLEEF 2.80 algorithm - - return log(RV(1.0) + x); + return r; +} + +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_log(realvec_t x) { + return log2(x) * RV(M_LN2); +} + +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_log10(realvec_t x) { + return log(x) * RV(M_LOG10E); +} + +template <typename realvec_t> +inline realvec_t mathfuncs<realvec_t>::vml_log1p(realvec_t x) { + // TODO: Check SLEEF 2.80 algorithm + + return log(RV(1.0) + x); #if 0 // Goldberg, theorem 4 realvec_t x1 = RV(1.0) + x; x1.barrier(); return ifthen(x1 == x, x, x * log(x1) / (x1 - RV(1.0))); #endif - } - +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_LOG_H +#endif // #ifndef MATHFUNCS_LOG_H diff --git a/mathfuncs_pow.h b/mathfuncs_pow.h index b863570..70bcc80 100644 --- a/mathfuncs_pow.h +++ b/mathfuncs_pow.h @@ -7,30 +7,27 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y) { + // Handle zero + boolvec_t is_zero = x == RV(0.0); + x = ifthen(is_zero, RV(1.0), x); + + realvec_t r = exp(log(fabs(x)) * y); + + // The result is negative if x<0 and if y is integer and odd + realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y)); + realvec_t sign = copysign(mod_y, x) + RV(0.5); + r = copysign(r, sign); + + // Handle zero + r = ifthen(is_zero, RV(0.0), r); + + return r; +} -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_pow(realvec_t x, realvec_t y) - { - // Handle zero - boolvec_t is_zero = x == RV(0.0); - x = ifthen(is_zero, RV(1.0), x); - - realvec_t r = exp(log(fabs(x)) * y); - - // The result is negative if x<0 and if y is integer and odd - realvec_t mod_y = fabs(y) - RV(2.0) * floor(RV(0.5) * fabs(y)); - realvec_t sign = copysign(mod_y, x) + RV(0.5); - r = copysign(r, sign); - - // Handle zero - r = ifthen(is_zero, RV(0.0), r); - - return r; - } - }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_POW_H +#endif // #ifndef MATHFUNCS_POW_H diff --git a/mathfuncs_rcp.h b/mathfuncs_rcp.h index 6e12b27..f703454 100644 --- a/mathfuncs_rcp.h +++ b/mathfuncs_rcp.h @@ -7,10 +7,8 @@ #include <cmath> - - namespace vecmathlib { - + #if 0 // This routine works, but may be slower than the one below template<typename realvec_t> @@ -50,66 +48,61 @@ namespace vecmathlib { return r; } #endif - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x) - { - // Handle negative values - realvec_t x0 = x; - x = fabs(x); - - // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28] - - // Initial guess + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_rcp(realvec_t x) { + // Handle negative values + realvec_t x0 = x; + x = fabs(x); + + // <https://en.wikipedia.org/wiki/Division_algorithm> [2013-06-28] + + // Initial guess + VML_ASSERT(all(x > RV(0.0))); + intvec_t x_exp; + x = frexp(x, &x_exp); + VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0))); + realvec_t r = RV(R(48.0) / R(17.0)) - RV(R(32.0) / R(17.0)) * x; + + // Iterate + int const nmax = sizeof(real_t) == 4 ? 3 : 4; + for (int n = 0; n < nmax; ++n) { + // Step VML_ASSERT(all(x > RV(0.0))); - intvec_t x_exp; - x = frexp(x, &x_exp); - VML_ASSERT(all(x >= RV(0.5) && x < RV(1.0))); - realvec_t r = RV(R(48.0)/R(17.0)) - RV(R(32.0)/R(17.0)) * x; - - // Iterate - int const nmax = sizeof(real_t)==4 ? 3 : 4; - for (int n=0; n<nmax; ++n) { - // Step - VML_ASSERT(all(x > RV(0.0))); - // Newton method: - // Solve f(r) = 0 for f(r) = x - 1/r - // r <- r - f(r) / f'(r) - // r <- 2 r - r^2 x - // r <- r + r (1 - r x) - - // Note: don't rewrite this expression, this may introduce - // cancellation errors - r += r * (RV(1.0) - x*r); - - // NEON: r = r * (RV(2.0) - x*r); - } - r = ldexp(r, -x_exp); - - // Handle negative values - r = copysign(r, x0); - - return r; - } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y) - { - return x - rint(x / y) * y; - // realvec_t r = x / y; - // return y * (r - rint(r)); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y) - { - return x - y * trunc(x / y); - // realvec_t r = x / y; - // return y * (r - trunc(r)); + // Newton method: + // Solve f(r) = 0 for f(r) = x - 1/r + // r <- r - f(r) / f'(r) + // r <- 2 r - r^2 x + // r <- r + r (1 - r x) + + // Note: don't rewrite this expression, this may introduce + // cancellation errors + r += r * (RV(1.0) - x * r); + + // NEON: r = r * (RV(2.0) - x*r); } - + r = ldexp(r, -x_exp); + + // Handle negative values + r = copysign(r, x0); + + return r; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_remainder(realvec_t x, realvec_t y) { + return x - rint(x / y) * y; + // realvec_t r = x / y; + // return y * (r - rint(r)); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_fmod(realvec_t x, realvec_t y) { + return x - y * trunc(x / y); + // realvec_t r = x / y; + // return y * (r - trunc(r)); +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_RCP_H +#endif // #ifndef MATHFUNCS_RCP_H diff --git a/mathfuncs_sin.h b/mathfuncs_sin.h index 8e2afd9..72ffb6f 100644 --- a/mathfuncs_sin.h +++ b/mathfuncs_sin.h @@ -7,230 +7,227 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d) { + // Algorithm taken from SLEEF 2.80 + + real_t PI4_A, PI4_B, PI4_C, PI4_D; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + PI4_A = 0.78515625f; + PI4_B = 0.00024187564849853515625f; + PI4_C = 3.7747668102383613586e-08f; + PI4_D = 1.2816720341285448015e-12f; + break; + case sizeof(double): + PI4_A = 0.78539816290140151978; + PI4_B = 4.9604678871439933374e-10; + PI4_C = 1.1258708853173288931e-18; + PI4_D = 1.7607799325916000908e-27; + break; + } + + realvec_t q = rint(d * RV(M_1_PI)); + intvec_t iq = convert_int(q); -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_sin(realvec_t d) - { - // Algorithm taken from SLEEF 2.80 - - real_t PI4_A, PI4_B, PI4_C, PI4_D; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - PI4_A = 0.78515625f; - PI4_B = 0.00024187564849853515625f; - PI4_C = 3.7747668102383613586e-08f; - PI4_D = 1.2816720341285448015e-12f; - break; - case sizeof(double): - PI4_A = 0.78539816290140151978; - PI4_B = 4.9604678871439933374e-10; - PI4_C = 1.1258708853173288931e-18; - PI4_D = 1.7607799325916000908e-27; - break; - } - - realvec_t q = rint(d * RV(M_1_PI)); - intvec_t iq = convert_int(q); - #ifdef VML_HAVE_FP_CONTRACT - d = mad(q, RV(-PI4_A*4), d); - d = mad(q, RV(-PI4_B*4), d); - d = mad(q, RV(-PI4_C*4), d); - d = mad(q, RV(-PI4_D*4), d); + d = mad(q, RV(-PI4_A * 4), d); + d = mad(q, RV(-PI4_B * 4), d); + d = mad(q, RV(-PI4_C * 4), d); + d = mad(q, RV(-PI4_D * 4), d); #else - d = mad(q, RV(-M_PI), d); + d = mad(q, RV(-M_PI), d); #endif - - realvec_t s = d * d; - - d = ifthen(convert_bool(iq & IV(I(1))), -d, d); - - realvec_t u; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - u = RV(2.6083159809786593541503e-06f); - u = mad(u, s, RV(-0.0001981069071916863322258f)); - u = mad(u, s, RV(0.00833307858556509017944336f)); - u = mad(u, s, RV(-0.166666597127914428710938f)); - break; - case sizeof(double): - u = RV(-7.97255955009037868891952e-18); - u = mad(u, s, RV(2.81009972710863200091251e-15)); - u = mad(u, s, RV(-7.64712219118158833288484e-13)); - u = mad(u, s, RV(1.60590430605664501629054e-10)); - u = mad(u, s, RV(-2.50521083763502045810755e-08)); - u = mad(u, s, RV(2.75573192239198747630416e-06)); - u = mad(u, s, RV(-0.000198412698412696162806809)); - u = mad(u, s, RV(0.00833333333333332974823815)); - u = mad(u, s, RV(-0.166666666666666657414808)); - break; - } - - u = mad(s, u * d, d); - - const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); - u = ifthen(isinf(d), RV(nan), u); - - return u; + + realvec_t s = d * d; + + d = ifthen(convert_bool(iq & IV(I(1))), -d, d); + + realvec_t u; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + u = RV(2.6083159809786593541503e-06f); + u = mad(u, s, RV(-0.0001981069071916863322258f)); + u = mad(u, s, RV(0.00833307858556509017944336f)); + u = mad(u, s, RV(-0.166666597127914428710938f)); + break; + case sizeof(double): + u = RV(-7.97255955009037868891952e-18); + u = mad(u, s, RV(2.81009972710863200091251e-15)); + u = mad(u, s, RV(-7.64712219118158833288484e-13)); + u = mad(u, s, RV(1.60590430605664501629054e-10)); + u = mad(u, s, RV(-2.50521083763502045810755e-08)); + u = mad(u, s, RV(2.75573192239198747630416e-06)); + u = mad(u, s, RV(-0.000198412698412696162806809)); + u = mad(u, s, RV(0.00833333333333332974823815)); + u = mad(u, s, RV(-0.166666666666666657414808)); + break; + } + + u = mad(s, u * d, d); + + const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); + u = ifthen(isinf(d), RV(nan), u); + + return u; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d) { + // Algorithm taken from SLEEF 2.80 + + real_t PI4_A, PI4_B, PI4_C, PI4_D; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + PI4_A = 0.78515625f; + PI4_B = 0.00024187564849853515625f; + PI4_C = 3.7747668102383613586e-08f; + PI4_D = 1.2816720341285448015e-12f; + break; + case sizeof(double): + PI4_A = 0.78539816290140151978; + PI4_B = 4.9604678871439933374e-10; + PI4_C = 1.1258708853173288931e-18; + PI4_D = 1.7607799325916000908e-27; + break; } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_cos(realvec_t d) - { - // Algorithm taken from SLEEF 2.80 - - real_t PI4_A, PI4_B, PI4_C, PI4_D; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - PI4_A = 0.78515625f; - PI4_B = 0.00024187564849853515625f; - PI4_C = 3.7747668102383613586e-08f; - PI4_D = 1.2816720341285448015e-12f; - break; - case sizeof(double): - PI4_A = 0.78539816290140151978; - PI4_B = 4.9604678871439933374e-10; - PI4_C = 1.1258708853173288931e-18; - PI4_D = 1.7607799325916000908e-27; - break; - } - - realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0)); - intvec_t iq = convert_int(q); - + + realvec_t q = mad(RV(2.0), rint(mad(d, RV(M_1_PI), RV(-0.5))), RV(1.0)); + intvec_t iq = convert_int(q); + #ifdef VML_HAVE_FP_CONTRACT - d = mad(q, RV(-PI4_A*2), d); - d = mad(q, RV(-PI4_B*2), d); - d = mad(q, RV(-PI4_C*2), d); - d = mad(q, RV(-PI4_D*2), d); + d = mad(q, RV(-PI4_A * 2), d); + d = mad(q, RV(-PI4_B * 2), d); + d = mad(q, RV(-PI4_C * 2), d); + d = mad(q, RV(-PI4_D * 2), d); #else - d = mad(q, RV(-M_PI_2), d); + d = mad(q, RV(-M_PI_2), d); #endif - - realvec_t s = d * d; - - d = ifthen(convert_bool(iq & IV(I(2))), d, -d); - - realvec_t u; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - u = RV(2.6083159809786593541503e-06f); - u = mad(u, s, RV(-0.0001981069071916863322258f)); - u = mad(u, s, RV(0.00833307858556509017944336f)); - u = mad(u, s, RV(-0.166666597127914428710938f)); - break; - case sizeof(double): - u = RV(-7.97255955009037868891952e-18); - u = mad(u, s, RV(2.81009972710863200091251e-15)); - u = mad(u, s, RV(-7.64712219118158833288484e-13)); - u = mad(u, s, RV(1.60590430605664501629054e-10)); - u = mad(u, s, RV(-2.50521083763502045810755e-08)); - u = mad(u, s, RV(2.75573192239198747630416e-06)); - u = mad(u, s, RV(-0.000198412698412696162806809)); - u = mad(u, s, RV(0.00833333333333332974823815)); - u = mad(u, s, RV(-0.166666666666666657414808)); - break; - } - - u = mad(s, u * d, d); - - const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); - u = ifthen(isinf(d), RV(nan), u); - - return u; + + realvec_t s = d * d; + + d = ifthen(convert_bool(iq & IV(I(2))), d, -d); + + realvec_t u; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + u = RV(2.6083159809786593541503e-06f); + u = mad(u, s, RV(-0.0001981069071916863322258f)); + u = mad(u, s, RV(0.00833307858556509017944336f)); + u = mad(u, s, RV(-0.166666597127914428710938f)); + break; + case sizeof(double): + u = RV(-7.97255955009037868891952e-18); + u = mad(u, s, RV(2.81009972710863200091251e-15)); + u = mad(u, s, RV(-7.64712219118158833288484e-13)); + u = mad(u, s, RV(1.60590430605664501629054e-10)); + u = mad(u, s, RV(-2.50521083763502045810755e-08)); + u = mad(u, s, RV(2.75573192239198747630416e-06)); + u = mad(u, s, RV(-0.000198412698412696162806809)); + u = mad(u, s, RV(0.00833333333333332974823815)); + u = mad(u, s, RV(-0.166666666666666657414808)); + break; } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d) - { - // Algorithm taken from SLEEF 2.80 - - real_t PI4_A, PI4_B, PI4_C, PI4_D; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - PI4_A = 0.78515625f; - PI4_B = 0.00024187564849853515625f; - PI4_C = 3.7747668102383613586e-08f; - PI4_D = 1.2816720341285448015e-12f; - break; - case sizeof(double): - PI4_A = 0.78539816290140151978; - PI4_B = 4.9604678871439933374e-10; - PI4_C = 1.1258708853173288931e-18; - PI4_D = 1.7607799325916000908e-27; - break; - } - - realvec_t q = rint(d * RV(2 * M_1_PI)); - intvec_t iq = convert_int(q); - - realvec_t x = d; - + + u = mad(s, u * d, d); + + const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); + u = ifthen(isinf(d), RV(nan), u); + + return u; +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_tan(realvec_t d) { + // Algorithm taken from SLEEF 2.80 + + real_t PI4_A, PI4_B, PI4_C, PI4_D; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + PI4_A = 0.78515625f; + PI4_B = 0.00024187564849853515625f; + PI4_C = 3.7747668102383613586e-08f; + PI4_D = 1.2816720341285448015e-12f; + break; + case sizeof(double): + PI4_A = 0.78539816290140151978; + PI4_B = 4.9604678871439933374e-10; + PI4_C = 1.1258708853173288931e-18; + PI4_D = 1.7607799325916000908e-27; + break; + } + + realvec_t q = rint(d * RV(2 * M_1_PI)); + intvec_t iq = convert_int(q); + + realvec_t x = d; + #ifdef VML_HAVE_FP_CONTRACT - x = mad(q, RV(-PI4_A*2), x); - x = mad(q, RV(-PI4_B*2), x); - x = mad(q, RV(-PI4_C*2), x); - x = mad(q, RV(-PI4_D*2), x); + x = mad(q, RV(-PI4_A * 2), x); + x = mad(q, RV(-PI4_B * 2), x); + x = mad(q, RV(-PI4_C * 2), x); + x = mad(q, RV(-PI4_D * 2), x); #else - x = mad(q, RV(-M_PI_2), x); + x = mad(q, RV(-M_PI_2), x); #endif - - realvec_t s = x * x; - - x = ifthen(convert_bool(iq & IV(I(1))), -x, x); - - realvec_t u; - switch (sizeof(real_t)) { - default: __builtin_unreachable(); - case sizeof(float): - u = RV(0.00927245803177356719970703f); - u = mad(u, s, RV(0.00331984995864331722259521f)); - u = mad(u, s, RV(0.0242998078465461730957031f)); - u = mad(u, s, RV(0.0534495301544666290283203f)); - u = mad(u, s, RV(0.133383005857467651367188f)); - u = mad(u, s, RV(0.333331853151321411132812f)); - break; - case sizeof(double): - u = RV(1.01419718511083373224408e-05); - u = mad(u, s, RV(-2.59519791585924697698614e-05)); - u = mad(u, s, RV(5.23388081915899855325186e-05)); - u = mad(u, s, RV(-3.05033014433946488225616e-05)); - u = mad(u, s, RV(7.14707504084242744267497e-05)); - u = mad(u, s, RV(8.09674518280159187045078e-05)); - u = mad(u, s, RV(0.000244884931879331847054404)); - u = mad(u, s, RV(0.000588505168743587154904506)); - u = mad(u, s, RV(0.00145612788922812427978848)); - u = mad(u, s, RV(0.00359208743836906619142924)); - u = mad(u, s, RV(0.00886323944362401618113356)); - u = mad(u, s, RV(0.0218694882853846389592078)); - u = mad(u, s, RV(0.0539682539781298417636002)); - u = mad(u, s, RV(0.133333333333125941821962)); - u = mad(u, s, RV(0.333333333333334980164153)); - break; - } - - u = mad(s, u * x, x); - - u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u); - - const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); - u = ifthen(isinf(d), RV(nan), u); - - return u; + + realvec_t s = x * x; + + x = ifthen(convert_bool(iq & IV(I(1))), -x, x); + + realvec_t u; + switch (sizeof(real_t)) { + default: + __builtin_unreachable(); + case sizeof(float): + u = RV(0.00927245803177356719970703f); + u = mad(u, s, RV(0.00331984995864331722259521f)); + u = mad(u, s, RV(0.0242998078465461730957031f)); + u = mad(u, s, RV(0.0534495301544666290283203f)); + u = mad(u, s, RV(0.133383005857467651367188f)); + u = mad(u, s, RV(0.333331853151321411132812f)); + break; + case sizeof(double): + u = RV(1.01419718511083373224408e-05); + u = mad(u, s, RV(-2.59519791585924697698614e-05)); + u = mad(u, s, RV(5.23388081915899855325186e-05)); + u = mad(u, s, RV(-3.05033014433946488225616e-05)); + u = mad(u, s, RV(7.14707504084242744267497e-05)); + u = mad(u, s, RV(8.09674518280159187045078e-05)); + u = mad(u, s, RV(0.000244884931879331847054404)); + u = mad(u, s, RV(0.000588505168743587154904506)); + u = mad(u, s, RV(0.00145612788922812427978848)); + u = mad(u, s, RV(0.00359208743836906619142924)); + u = mad(u, s, RV(0.00886323944362401618113356)); + u = mad(u, s, RV(0.0218694882853846389592078)); + u = mad(u, s, RV(0.0539682539781298417636002)); + u = mad(u, s, RV(0.133333333333125941821962)); + u = mad(u, s, RV(0.333333333333334980164153)); + break; } - + + u = mad(s, u * x, x); + + u = ifthen(convert_bool(iq & IV(I(1))), rcp(u), u); + + const real_t nan = std::numeric_limits<real_t>::quiet_NaN(); + u = ifthen(isinf(d), RV(nan), u); + + return u; +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_SIN_H +#endif // #ifndef MATHFUNCS_SIN_H diff --git a/mathfuncs_sinh.h b/mathfuncs_sinh.h index 04aa446..a8c2ee3 100644 --- a/mathfuncs_sinh.h +++ b/mathfuncs_sinh.h @@ -7,28 +7,23 @@ #include <cmath> +namespace vecmathlib { +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x) { + return RV(0.5) * (exp(x) + exp(-x)); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x) { + return RV(0.5) * (exp(x) - exp(-x)); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x) { + return sinh(x) / cosh(x); +} -namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_cosh(realvec_t x) - { - return RV(0.5) * (exp(x) + exp(-x)); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_sinh(realvec_t x) - { - return RV(0.5) * (exp(x) - exp(-x)); - } - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_tanh(realvec_t x) - { - return sinh(x) / cosh(x); - } - }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_SINH_H +#endif // #ifndef MATHFUNCS_SINH_H diff --git a/mathfuncs_sqrt.h b/mathfuncs_sqrt.h index dea5fd6..7a362f9 100644 --- a/mathfuncs_sqrt.h +++ b/mathfuncs_sqrt.h @@ -7,13 +7,10 @@ #include <cmath> - - namespace vecmathlib { - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) - { + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_sqrt(realvec_t x) { #if 0 // Handle special case: zero boolvec_t is_zero = x <= RV(0.0); @@ -49,29 +46,23 @@ namespace vecmathlib { // Handle special case: zero r = ifthen(is_zero, RV(0.0), r); #endif - - realvec_t r = x * rsqrt(x); - // Handle special case: zero - r = ifthen(x == RV(0.0), RV(0.0), r); - - return r; - } - - - - // TODO: Use "Halley's method with cubic convergence": - // <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf> - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) - { - return pow(x, RV(1.0/3.0)); - } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) - { + + realvec_t r = x * rsqrt(x); + // Handle special case: zero + r = ifthen(x == RV(0.0), RV(0.0), r); + + return r; +} + +// TODO: Use "Halley's method with cubic convergence": +// <http://press.mcs.anl.gov/gswjanuary12/files/2012/01/Optimizing-Single-Node-Performance-on-BlueGene.pdf> +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_cbrt(realvec_t x) { + return pow(x, RV(1.0 / 3.0)); +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_rsqrt(realvec_t x) { #if 0 // See <http://en.wikipedia.org/wiki/Fast_inverse_square_root> realvec_t x_2 = RV(0.5) * x; @@ -85,46 +76,43 @@ namespace vecmathlib { r += r * (RV(0.5) - (x_2 * r * r)); return r; #else - // Initial guess - // VML_ASSERT(all(x > RV(0.0))); - intvec_t ilogb_x = ilogb(x); - realvec_t s = + // Initial guess + // VML_ASSERT(all(x > RV(0.0))); + intvec_t ilogb_x = ilogb(x); + realvec_t s = ifthen(convert_bool(ilogb_x & IV(I(1))), RV(R(0.583)), RV(R(0.824))); - realvec_t r = ldexp(s, -(ilogb_x >> I(1))); - - realvec_t x_2 = RV(0.5) * x; - - // Iterate - // nmax iterations give an accuracy of 2^nmax binary digits. 5 - // iterations suffice for double precision with its 53 digits. - int const nmax = sizeof(real_t)==4 ? 4 : 5; - for (int n=0; n<nmax; ++n) { - // Step - VML_ASSERT(all(r > RV(0.0))); - // Newton method: - // Solve f(r) = 0 for f(r) = x - 1/r^2 - // r <- r - f(r) / f'(r) - // r <- (3 r - r^3 x) / 2 - // r <- r (3/2 - r^2 x/2) - - // Note: don't rewrite this expression, this may introduce - // cancellation errors (says who?) - // r *= RV(1.5) - x_2 * r*r; - r += r * (RV(0.5) - x_2 * r*r); - } - - return r; -#endif - } - - - - template<typename realvec_t> - realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) - { - return sqrt(x*x + y*y); + realvec_t r = ldexp(s, -(ilogb_x >> I(1))); + + realvec_t x_2 = RV(0.5) * x; + + // Iterate + // nmax iterations give an accuracy of 2^nmax binary digits. 5 + // iterations suffice for double precision with its 53 digits. + int const nmax = sizeof(real_t) == 4 ? 4 : 5; + for (int n = 0; n < nmax; ++n) { + // Step + VML_ASSERT(all(r > RV(0.0))); + // Newton method: + // Solve f(r) = 0 for f(r) = x - 1/r^2 + // r <- r - f(r) / f'(r) + // r <- (3 r - r^3 x) / 2 + // r <- r (3/2 - r^2 x/2) + + // Note: don't rewrite this expression, this may introduce + // cancellation errors (says who?) + // r *= RV(1.5) - x_2 * r*r; + r += r * (RV(0.5) - x_2 * r * r); } - + + return r; +#endif +} + +template <typename realvec_t> +realvec_t mathfuncs<realvec_t>::vml_hypot(realvec_t x, realvec_t y) { + return sqrt(x * x + y * y); +} + }; // namespace vecmathlib -#endif // #ifndef MATHFUNCS_SQRT_H +#endif // #ifndef MATHFUNCS_SQRT_H diff --git a/selftest.cc b/selftest.cc index 4296f14..334d95f 100644 --- a/selftest.cc +++ b/selftest.cc @@ -14,22 +14,17 @@ using namespace std; - - int num_errors = 0; +template <typename realvec_t> struct vecmathlib_test { - -template<typename realvec_t> -struct vecmathlib_test { - typedef typename realvec_t::boolvec_t boolvec_t; typedef typename realvec_t::intvec_t intvec_t; - + typedef typename realvec_t::int_t int_t; typedef typename realvec_t::uint_t uint_t; typedef typename realvec_t::real_t real_t; - + // Short names for type casts typedef real_t R; typedef int_t I; @@ -37,16 +32,13 @@ struct vecmathlib_test { typedef realvec_t RV; typedef intvec_t IV; typedef boolvec_t BV; - + typedef vecmathlib::floatprops<real_t> FP; typedef vecmathlib::mathfuncs<realvec_t> MF; - - - + // Test each function with this many random values static const int imax = 10000; - static real_t accuracy(real_t ulp = R(0.5)) - { + static real_t accuracy(real_t ulp = R(0.5)) { #ifdef VML_HAVE_FP_CONTRACT // Require that 100% of the digits are correct // real_t digit_fraction = 1.0; @@ -56,526 +48,451 @@ struct vecmathlib_test { // Require that 80% of the digits are correct real_t digit_fraction = 0.8; #endif - digit_fraction *= 0.95; // some lenience for testing (why?) + digit_fraction *= 0.95; // some lenience for testing (why?) return pow(ulp * realvec_t::epsilon(), digit_fraction); } - - - - static realvec_t random(const real_t xmin, const real_t xmax) - { + + static realvec_t random(const real_t xmin, const real_t xmax) { realvec_t x; - for (int i=0; i<realvec_t::size; ++i) { - const real_t r = - (xmax - xmin) * FP::convert_float(rand()) / FP::convert_float(RAND_MAX); + for (int i = 0; i < realvec_t::size; ++i) { + const real_t r = (xmax - xmin) * FP::convert_float(rand()) / + FP::convert_float(RAND_MAX); x.set_elt(i, xmin + r); } return x; } - - static intvec_t random(const int_t nmin, const int_t nmax) - { + + static intvec_t random(const int_t nmin, const int_t nmax) { intvec_t n; - for (int i=0; i<intvec_t::size; ++i) { - const real_t r = - R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0)); + for (int i = 0; i < intvec_t::size; ++i) { + const real_t r = R(nmax - nmin + 1) * R(rand()) / (R(RAND_MAX) + R(1.0)); n.set_elt(i, nmin + FP::convert_int(floor(r))); } return n; } - - - - static bool is_big_endian() - { + + static bool is_big_endian() { const int i = 1; unsigned char cs[sizeof i]; memcpy(cs, &i, sizeof i); - return cs[0]==0; + return cs[0] == 0; } - - template<typename T> - static string hex(const T x) - { + + template <typename T> static string hex(const T x) { unsigned char cs[sizeof x]; memcpy(cs, &x, sizeof x); ostringstream buf; buf << "0x"; - const char* const hexdigits = "0123456789abcdef"; + const char *const hexdigits = "0123456789abcdef"; const int n0 = is_big_endian() ? 0 : sizeof x - 1; const int dn = is_big_endian() ? +1 : -1; const int n1 = n0 + sizeof x * dn; - for (int n=n0; n!=n1; n+=dn) { - buf << hexdigits[cs[n]>>4] << hexdigits[cs[n]&15]; + for (int n = n0; n != n1; n += dn) { + buf << hexdigits[cs[n] >> 4] << hexdigits[cs[n] & 15]; } return buf.str(); } - - - - static boolvec_t supported(realvec_t x) - { - return x==RV(0.0) || MF::vml_ieee_isnormal(x) + + static boolvec_t supported(realvec_t x) { + return x == RV(0.0) || MF::vml_ieee_isnormal(x) #ifdef VML_HAVE_DENORMALS - || MF::vml_ieee_isfinite(x) + || MF::vml_ieee_isfinite(x) #endif #ifdef VML_HAVE_INF - || MF::vml_ieee_isinf(x) + || MF::vml_ieee_isinf(x) #endif #ifdef VML_HAVE_NAN - || MF::vml_ieee_isnan(x) + || MF::vml_ieee_isnan(x) #endif - ; - } - - static boolvec_t supported(intvec_t x) - { - return true; - } - - static boolvec_t supported(boolvec_t x) - { - return true; + ; } - - - + + static boolvec_t supported(intvec_t x) { return true; } + + static boolvec_t supported(boolvec_t x) { return true; } + // Check load memory access - static void check_mem(const char* const func, - const realvec_t x, - const real_t* const p, - const realvec_t xold, - const int mval) - { + static void check_mem(const char *const func, const realvec_t x, + const real_t *const p, const realvec_t xold, + const int mval) { realvec_t xwant; - for (int i=0; i<realvec_t::size; ++i) { - xwant.set_elt(i, mval & (1<<i) ? p[i] : xold[i]); + for (int i = 0; i < realvec_t::size; ++i) { + xwant.set_elt(i, mval & (1 << i) ? p[i] : xold[i]); } const boolvec_t isbad = x != xwant; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " found=" << x << " [" << hex(x) << "]\n" << " expected=" << xwant << " [" << hex(xwant) << "]\n" << " mval=" << mval << " [" << hex(mval) << "]\n" - << " isbad=" << isbad << "\n" - << flush; + << " isbad=" << isbad << "\n" << flush; } } - + // Check store memory access - static void check_mem(const char* const func, - const real_t* const p, - const realvec_t x, - const real_t* const pold, - const int mval) - { + static void check_mem(const char *const func, const real_t *const p, + const realvec_t x, const real_t *const pold, + const int mval) { realvec_t pv, pvwant; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { pv.set_elt(i, p[i]); - pvwant.set_elt(i, mval & (1<<i) ? x[i] : pold[i]); + pvwant.set_elt(i, mval & (1 << i) ? x[i] : pold[i]); } const boolvec_t isbad = pv != pvwant; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " found=" << pv << " [" << hex(pv) << "]\n" << " expected=" << pvwant << " [" << hex(pvwant) << "]\n" - << " isbad=" << isbad << "\n" - << flush; + << " isbad=" << isbad << "\n" << flush; } } - - static void check_bool(const char* const func, - const bool rstd, const bool rvml) - { + + static void check_bool(const char *const func, const bool rstd, + const bool rvml) { const bool dr = rstd ^ rvml; const bool isbad = dr; if (isbad) { - ++ num_errors; + ++num_errors; cout << "Error in " << func << ":\n" << " fstd()=" << rstd << " [" << hex(rstd) << "]\n" << " fvml()=" << rvml << " [" << hex(rvml) << "]\n" - << " isbad()=" << isbad << "\n" - << flush; + << " isbad()=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_bool(const char* const func, - const bool rstd, const bool rvml, const A x) - { + + template <typename A> + static void check_bool(const char *const func, const bool rstd, + const bool rvml, const A x) { const bool dr = rstd ^ rvml; const bool isbad = dr; if (isbad) { - ++ num_errors; + ++num_errors; cout << "Error in " << func << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" - << " isbad(x)=" << isbad << "\n" - << flush; + << " isbad(x)=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_bool(const char* const func, - const boolvec_t rstd, const boolvec_t rvml, - const A x) - { + + template <typename A> + static void check_bool(const char *const func, const boolvec_t rstd, + const boolvec_t rvml, const A x) { boolvec_t dr; bool isbad = false; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { dr.set_elt(i, rstd[i] ^ rvml[i]); isbad |= dr[i]; } if (isbad) { - ++ num_errors; + ++num_errors; cout << "Error in " << func << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x)=" << dr << " [" << hex(rvml) << "]\n" - << " isbad(x)=" << isbad << "\n" - << flush; + << " isbad(x)=" << isbad << "\n" << flush; } } - - template<typename A, typename B> - static void check_bool(const char* const func, - const boolvec_t rstd, const boolvec_t rvml, - const A x, const B y) - { + + template <typename A, typename B> + static void check_bool(const char *const func, const boolvec_t rstd, + const boolvec_t rvml, const A x, const B y) { boolvec_t dr; bool isbad = false; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { dr.set_elt(i, rstd[i] ^ rvml[i]); isbad |= dr[i]; } if (isbad) { - ++ num_errors; + ++num_errors; cout << "Error in " << func << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y)=" << dr << " [" << hex(rvml) << "]\n" - << " isbad(x,y)=" << isbad << "\n" - << flush; + << " isbad(x,y)=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_bool(const char* const func, - bool fstd(typename A::scalar_t x), - boolvec_t fvml(A x), - const A x) - { + + template <typename A> + static void check_bool(const char *const func, + bool fstd(typename A::scalar_t x), boolvec_t fvml(A x), + const A x) { boolvec_t rstd; - for (int i=0; i<boolvec_t::size; ++i) { + for (int i = 0; i < boolvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i])); } const boolvec_t rvml = fvml(x); const boolvec_t dr = rstd != rvml; const boolvec_t isbad = supported(x) && supported(rstd) && dr; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x)=" << isbad << "\n" - << flush; + << " isbad(x)=" << isbad << "\n" << flush; } } - - template<typename A, typename B> - static void check_bool(const char* const func, + + template <typename A, typename B> + static void check_bool(const char *const func, bool fstd(typename A::scalar_t x, typename B::scalar_t y), - boolvec_t fvml(A x, B y), - const A x, const B y) - { + boolvec_t fvml(A x, B y), const A x, const B y) { boolvec_t rstd; - for (int i=0; i<boolvec_t::size; ++i) { + for (int i = 0; i < boolvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i])); } const boolvec_t rvml = fvml(x, y); const boolvec_t dr = rstd != rvml; const boolvec_t isbad = supported(x) && supported(rstd) && dr; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x,y)=" << isbad << "\n" - << flush; + << " isbad(x,y)=" << isbad << "\n" << flush; } } - - template<typename A, typename B, typename C> - static void check_bool(const char* const func, - bool fstd(typename A::scalar_t x, - typename B::scalar_t y, - typename C::scalar_t z), - boolvec_t fvml(A x, B y, C z), - const A x, const B y, const C z) - { + + template <typename A, typename B, typename C> + static void + check_bool(const char *const func, + bool fstd(typename A::scalar_t x, typename B::scalar_t y, + typename C::scalar_t z), + boolvec_t fvml(A x, B y, C z), const A x, const B y, const C z) { boolvec_t rstd; - for (int i=0; i<boolvec_t::size; ++i) { + for (int i = 0; i < boolvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i], z[i])); } const boolvec_t rvml = fvml(x, y, z); const boolvec_t dr = rstd != rvml; const boolvec_t isbad = supported(x) && supported(rstd) && dr; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " z=" << z << " [" << hex(z) << "]\n" << " fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y,z)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x,y,z)=" << isbad << "\n" - << flush; + << " isbad(x,y,z)=" << isbad << "\n" << flush; } } - - static void check_int(const char* const func, - const int_t rstd, const int_t rvml) - { + + static void check_int(const char *const func, const int_t rstd, + const int_t rvml) { const int_t dr = rstd - rvml; const bool isbad = dr; if (isbad) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " fstd()=" << rstd << " [" << hex(rstd) << "]\n" << " fvml()=" << rvml << " [" << hex(rvml) << "]\n" << " error()=" << dr << " [" << hex(dr) << "]\n" - << " isbad()=" << isbad << "\n" - << flush; + << " isbad()=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_int(const char* const func, - int_t fstd(typename A::scalar_t x), - intvec_t fvml(A x), - const A x) - { + + template <typename A> + static void check_int(const char *const func, + int_t fstd(typename A::scalar_t x), intvec_t fvml(A x), + const A x) { intvec_t rstd; - for (int i=0; i<intvec_t::size; ++i) { + for (int i = 0; i < intvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i])); } const intvec_t rvml = fvml(x); const intvec_t dr = rstd - rvml; const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr); if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x)=" << isbad << "\n" - << flush; + << " isbad(x)=" << isbad << "\n" << flush; } } - - template<typename A, typename B> - static void check_int(const char* const func, + + template <typename A, typename B> + static void check_int(const char *const func, int_t fstd(typename A::scalar_t x, B y), - intvec_t fvml(A x, B y), - const A x, const B y) - { + intvec_t fvml(A x, B y), const A x, const B y) { intvec_t rstd; - for (int i=0; i<intvec_t::size; ++i) { + for (int i = 0; i < intvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y)); } const intvec_t rvml = fvml(x, y); const intvec_t dr = rstd - rvml; const boolvec_t isbad = supported(x) && supported(rstd) && convert_bool(dr); if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x,y)=" << isbad << "\n" - << flush; + << " isbad(x,y)=" << isbad << "\n" << flush; } } - - template<typename A, typename B> - static void check_int(const char* const func, + + template <typename A, typename B> + static void check_int(const char *const func, int_t fstd(typename A::scalar_t x, typename B::scalar_t y), - intvec_t fvml(A x, B y), - const A x, const B y) - { + intvec_t fvml(A x, B y), const A x, const B y) { intvec_t rstd; - for (int i=0; i<intvec_t::size; ++i) { + for (int i = 0; i < intvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i])); } const intvec_t rvml = fvml(x, y); const intvec_t dr = rstd - rvml; const boolvec_t isbad = - supported(x) && supported(y) && supported(rstd) && convert_bool(dr); + supported(x) && supported(y) && supported(rstd) && convert_bool(dr); if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x,y)=" << isbad << "\n" - << flush; + << " isbad(x,y)=" << isbad << "\n" << flush; } } - - template<typename A, typename B, typename C> - static void check_int(const char* const func, - int_t fstd(typename A::scalar_t x, - typename B::scalar_t y, - typename C::scalar_t z), - intvec_t fvml(A x, B y, C z), - const A x, const B y, const C z) - { + + template <typename A, typename B, typename C> + static void + check_int(const char *const func, + int_t fstd(typename A::scalar_t x, typename B::scalar_t y, + typename C::scalar_t z), + intvec_t fvml(A x, B y, C z), const A x, const B y, const C z) { intvec_t rstd; - for (int i=0; i<intvec_t::size; ++i) { + for (int i = 0; i < intvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i], z[i])); } const intvec_t rvml = fvml(x, y, z); const intvec_t dr = rstd - rvml; - const boolvec_t isbad = - supported(x) && supported(y) && supported(z) && supported(rstd) && - convert_bool(dr); + const boolvec_t isbad = supported(x) && supported(y) && supported(z) && + supported(rstd) && convert_bool(dr); if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " z=" << z << " [" << hex(z) << "]\n" << " fstd(x,y,z)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x,y,z)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x,y,z)=" << dr << " [" << hex(dr) << "]\n" - << " isbad(x,y,z)=" << isbad << "\n" - << flush; + << " isbad(x,y,z)=" << isbad << "\n" << flush; } } - - static void check_real(const char* const func, - const real_t rstd, const real_t rvml) - { + + static void check_real(const char *const func, const real_t rstd, + const real_t rvml) { const real_t dr = rstd - rvml; const bool isbad = dr != R(0.0); if (isbad) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << "():\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << "():\n" << " fstd()=" << rstd << " [" << hex(rstd) << "]\n" << " fvml()=" << rvml << " [" << hex(rvml) << "]\n" << " error()=" << dr << "\n" - << " isbad()=" << isbad << "\n" - << flush; + << " isbad()=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_real(const char* const func, - const real_t rstd, const real_t rvml, const A x, - const real_t accuracy) - { + + template <typename A> + static void check_real(const char *const func, const real_t rstd, + const real_t rvml, const A x, const real_t accuracy) { const real_t dr = rstd - rvml; real_t maxabs = 0.0; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { maxabs = vml_std::fmax(maxabs, vml_std::fabs(x[i])); } const real_t scale = fabs(rstd) + fabs(rvml) + fabs(maxabs) + R(1.0); const bool isbad = fabs(dr) > accuracy * scale; if (isbad) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << "():\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << "():\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" << " error(x)=" << dr << "\n" - << " isbad(x)=" << isbad << "\n" - << flush; + << " isbad(x)=" << isbad << "\n" << flush; } } - - template<typename A> - static void check_real(const char* const func, - real_t fstd(typename A::scalar_t x), - realvec_t fvml(A x), - const A x, - const real_t accuracy) - { + + template <typename A> + static void + check_real(const char *const func, real_t fstd(typename A::scalar_t x), + realvec_t fvml(A x), const A x, const real_t accuracy) { realvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i])); } const realvec_t rvml = fvml(x); const realvec_t dr = rstd - rvml; const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0); - const boolvec_t isbad = - supported(x) && supported(rstd) && - fabs(dr) > realvec_t(accuracy) * scale; + const boolvec_t isbad = supported(x) && supported(rstd) && + fabs(dr) > realvec_t(accuracy) * scale; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " fstd(x)=" << rstd << " [" << hex(rstd) << "]\n" << " fvml(x)=" << rvml << " [" << hex(rvml) << "]\n" << " abs-error(x)=" << fabs(dr) << "\n" << " rel-error(x)=" << fabs(dr) / scale << "\n" << " isbad(x)=" << isbad << "\n" - << " accuracy=" << accuracy << "\n" - << flush; + << " accuracy=" << accuracy << "\n" << flush; } } - - template<typename A, typename B> - static void check_real(const char* const func, + + template <typename A, typename B> + static void check_real(const char *const func, real_t fstd(typename A::scalar_t x, B y), - realvec_t fvml(A x, B y), - const A x, const B y, - const real_t accuracy) - { + realvec_t fvml(A x, B y), const A x, const B y, + const real_t accuracy) { realvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y)); } const realvec_t rvml = fvml(x, y); const realvec_t dr = rstd - rvml; const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0); - const boolvec_t isbad = - supported(x) && supported(rstd) && fabs(dr) > realvec_t(accuracy) * scale; + const boolvec_t isbad = supported(x) && supported(rstd) && + fabs(dr) > realvec_t(accuracy) * scale; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" @@ -583,38 +500,32 @@ struct vecmathlib_test { << " abs-error(x,y)=" << fabs(dr) << "\n" << " rel-error(x,y)=" << fabs(dr) / scale << "\n" << " isbad(x,y)=" << isbad << "\n" - << " accuracy=" << accuracy << "\n" - << flush; + << " accuracy=" << accuracy << "\n" << flush; } } - - template<typename A, typename B> - static void check_real(const char* const func, - real_t fstd(typename A::scalar_t x, - typename B::scalar_t y), - realvec_t fvml(A x, B y), - const A x, const B y, - const real_t accuracy, - const realvec_t offset = RV(0.0)) - { + + template <typename A, typename B> + static void + check_real(const char *const func, + real_t fstd(typename A::scalar_t x, typename B::scalar_t y), + realvec_t fvml(A x, B y), const A x, const B y, + const real_t accuracy, const realvec_t offset = RV(0.0)) { realvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i])); } realvec_t rvml = fvml(x, y); // Fix up rvml by adding/subtracting the offset - rvml = ifthen(fabs(rstd-rvml)>fabs(offset/RV(2.0)), - rvml + copysign(offset, rstd-rvml), - rvml); + rvml = ifthen(fabs(rstd - rvml) > fabs(offset / RV(2.0)), + rvml + copysign(offset, rstd - rvml), rvml); const realvec_t dr = rstd - rvml; const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0); - const boolvec_t isbad = - supported(x) && supported(y) && supported(rstd) && - fabs(dr) > realvec_t(accuracy) * scale; + const boolvec_t isbad = supported(x) && supported(y) && supported(rstd) && + fabs(dr) > realvec_t(accuracy) * scale; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " fstd(x,y)=" << rstd << " [" << hex(rstd) << "]\n" @@ -622,34 +533,31 @@ struct vecmathlib_test { << " abs-error(x,y)=" << fabs(dr) << "\n" << " rel-error(x,y)=" << fabs(dr) / scale << "\n" << " isbad(x,y)=" << isbad << "\n" - << " accuracy=" << accuracy << "\n" - << flush; + << " accuracy=" << accuracy << "\n" << flush; } } - - template<typename A, typename B, typename C> - static void check_real(const char* const func, + + template <typename A, typename B, typename C> + static void check_real(const char *const func, real_t fstd(typename A::scalar_t x, typename B::scalar_t y, typename C::scalar_t z), - realvec_t fvml(A x, B y, C z), - const A x, const B y, C const z, - const real_t accuracy) - { + realvec_t fvml(A x, B y, C z), const A x, const B y, + C const z, const real_t accuracy) { realvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, fstd(x[i], y[i], z[i])); } const realvec_t rvml = fvml(x, y, z); const realvec_t dr = rstd - rvml; const realvec_t scale = fabs(rstd) + fabs(rvml) + realvec_t(1.0); - const boolvec_t isbad = - supported(x) && supported(y) && supported(z) && supported(rstd) && - fabs(dr) > realvec_t(accuracy) * scale; + const boolvec_t isbad = supported(x) && supported(y) && supported(z) && + supported(rstd) && + fabs(dr) > realvec_t(accuracy) * scale; if (any(isbad)) { - ++ num_errors; - cout << setprecision(realvec_t::digits10+2) - << "Error in " << func << ":\n" + ++num_errors; + cout << setprecision(realvec_t::digits10 + 2) << "Error in " << func + << ":\n" << " x=" << x << " [" << hex(x) << "]\n" << " y=" << y << " [" << hex(y) << "]\n" << " z=" << z << " [" << hex(z) << "]\n" @@ -658,61 +566,57 @@ struct vecmathlib_test { << " abs-error(x,y,z)=" << fabs(dr) << "\n" << " rel-error(x,y,z)=" << fabs(dr) / scale << "\n" << " isbad(x,y,z)=" << isbad << "\n" - << " accuracy=" << accuracy << "\n" - << flush; + << " accuracy=" << accuracy << "\n" << flush; } } - - - - static real_t* align_mem(real_t* p) - { + + static real_t *align_mem(real_t *p) { const ptrdiff_t alignment = sizeof(realvec_t); - p = (real_t*)((intptr_t(p) + alignment-1) & -alignment); + p = (real_t *)((intptr_t(p) + alignment - 1) & -alignment); assert(intptr_t(p) % alignment == 0); return p; } - static string add_suffix(const char* str, int i) - { + static string add_suffix(const char *str, int i) { ostringstream buf; buf << str << "." << i; return buf.str(); } - static void test_mem() - { - cout << " testing loada loadu storea storeu (errors may lead to segfaults)...\n" << flush; + static void test_mem() { + cout << " testing loada loadu storea storeu (errors may lead to " + "segfaults)...\n" + << flush; const int n = 4; const int sz = realvec_t::size; - const int nbytes = n*sz*sizeof(real_t); - real_t* const x = align_mem(new real_t[(n+1)*sz]); - real_t* const xnew = align_mem(new real_t[(n+1)*sz]); - for (int i=0; i<n; ++i) { + const int nbytes = n * sz * sizeof(real_t); + real_t *const x = align_mem(new real_t[(n + 1) * sz]); + real_t *const xnew = align_mem(new real_t[(n + 1) * sz]); + for (int i = 0; i < n; ++i) { realvec_t xv = random(R(-10.0), R(+10.0)); - memcpy(&x[i*sz], &xv, sizeof xv); + memcpy(&x[i * sz], &xv, sizeof xv); } const realvec_t z = random(R(-10.0), R(+10.0)); - + // loada { const real_t *p = &x[sz]; realvec_t y = realvec_t::loada(p); check_mem("loada", y, p, z, ~0); } - + // loadu - for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + for (ptrdiff_t i = 0; i < realvec_t::size; ++i) { const real_t *p = &x[sz]; - realvec_t y = realvec_t::loadu(p+i); - check_mem(add_suffix("loadu", i).c_str(), y, p+i, z, ~0); + realvec_t y = realvec_t::loadu(p + i); + check_mem(add_suffix("loadu", i).c_str(), y, p + i, z, ~0); } - + // loadu(ioff) - for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) { const real_t *p = &x[sz]; realvec_t y = realvec_t::loadu(p, ioff); - check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p+ioff, z, ~0); + check_mem(add_suffix("loadu(ioff)", ioff).c_str(), y, p + ioff, z, ~0); } - + // storea { memcpy(xnew, x, nbytes); @@ -720,50 +624,51 @@ struct vecmathlib_test { storea(z, p); check_mem("storea", p, z, &x[sz], ~0); } - + // storeu - for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + for (ptrdiff_t i = 0; i < realvec_t::size; ++i) { memcpy(xnew, x, nbytes); real_t *p = &xnew[sz]; - storeu(z, p+i); - check_mem(add_suffix("storeu", i).c_str(), p+i, z, &x[sz+i], ~0); + storeu(z, p + i); + check_mem(add_suffix("storeu", i).c_str(), p + i, z, &x[sz + i], ~0); } - + // storeu - for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) { memcpy(xnew, x, nbytes); real_t *p = &xnew[sz]; storeu(z, p, ioff); - check_mem(add_suffix("storeu(ioff)", ioff).c_str(), - p+ioff, z, &x[sz+ioff], ~0); + check_mem(add_suffix("storeu(ioff)", ioff).c_str(), p + ioff, z, + &x[sz + ioff], ~0); } - - for (int mval=0; mval<(1<<realvec_t::size); ++mval) { + + for (int mval = 0; mval < (1 << realvec_t::size); ++mval) { boolvec_t mbool; - for (int i=0; i<realvec_t::size; ++i) mbool.set_elt(i, mval & (1<<i)); + for (int i = 0; i < realvec_t::size; ++i) + mbool.set_elt(i, mval & (1 << i)); typename realvec_t::mask_t mask(mbool); - + // loada(mask) { const real_t *p = &x[sz]; realvec_t y = loada(p, z, mask); check_mem("loada(mask)", y, p, z, mval); } - + // loadu(mask) - for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + for (ptrdiff_t i = 0; i < realvec_t::size; ++i) { const real_t *p = &x[sz]; - realvec_t y = loadu(p+i, z, mask); - check_mem("loadu(mask)", y, p+i, z, mval); + realvec_t y = loadu(p + i, z, mask); + check_mem("loadu(mask)", y, p + i, z, mval); } - + // loadu(ioff, mask) - for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) { const real_t *p = &x[sz]; realvec_t y = loadu(p, ioff, z, mask); - check_mem("loadu(ioff,mask)", y, p+ioff, z, mval); + check_mem("loadu(ioff,mask)", y, p + ioff, z, mval); } - + // storea { memcpy(xnew, x, nbytes); @@ -771,37 +676,35 @@ struct vecmathlib_test { storea(z, p, mask); check_mem("storea(mask)", p, z, &x[sz], mval); } - + // storeu - for (ptrdiff_t i=0; i<realvec_t::size; ++i) { + for (ptrdiff_t i = 0; i < realvec_t::size; ++i) { memcpy(xnew, x, nbytes); real_t *p = &xnew[sz]; - storeu(z, p+i, mask); - check_mem("storeu(mask)", p+i, z, &x[sz+i], mval); + storeu(z, p + i, mask); + check_mem("storeu(mask)", p + i, z, &x[sz + i], mval); } - + // storeu - for (ptrdiff_t ioff=0; ioff<realvec_t::size; ++ioff) { + for (ptrdiff_t ioff = 0; ioff < realvec_t::size; ++ioff) { memcpy(xnew, x, nbytes); real_t *p = &xnew[sz]; storeu(z, p, ioff, mask); - check_mem("storeu(ioff,mask)", p+ioff, z, &x[sz+ioff], mval); + check_mem("storeu(ioff,mask)", p + ioff, z, &x[sz + ioff], mval); } - + } // for mval } - - - - template<typename T> - static T local_ifthen(bool b, T x, T y) { return b ? x : y; } - static void test_bool() - { + + template <typename T> static T local_ifthen(bool b, T x, T y) { + return b ? x : y; + } + static void test_bool() { cout << " testing boolean operations...\n" << flush; - + const boolvec_t bf = boolvec_t(false); const boolvec_t bt = boolvec_t(true); - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { check_bool("false", false, bf[i]); check_bool("true", true, bt[i]); } @@ -809,32 +712,32 @@ struct vecmathlib_test { check_bool("all", true, all(bt), true); check_bool("any", false, any(bf), false); check_bool("any", true, any(bt), true); - + boolvec_t b0 = bt; boolvec_t b1 = bf; - for (int n=0; n<realvec_t::size; ++n) { + for (int n = 0; n < realvec_t::size; ++n) { b0.set_elt(n, false); b1.set_elt(n, true); - for (int i=0; i<realvec_t::size; ++i) { - check_bool("set_elt", i<=n ? false : true, b0[i], false); - check_bool("set_elt", i<=n ? true : false, b1[i], true); + for (int i = 0; i < realvec_t::size; ++i) { + check_bool("set_elt", i <= n ? false : true, b0[i], false); + check_bool("set_elt", i <= n ? true : false, b1[i], true); } } - - for (int n=0; n<(1<<realvec_t::size); ++n) { + + for (int n = 0; n < (1 << realvec_t::size); ++n) { boolvec_t x; - for (int i=0; i<realvec_t::size; ++i) { - x.set_elt(i, n & (1<<i)); + for (int i = 0; i < realvec_t::size; ++i) { + x.set_elt(i, n & (1 << i)); } - for (int i=0; i<realvec_t::size; ++i) { - bool rstd = n & (1<<i); + for (int i = 0; i < realvec_t::size; ++i) { + bool rstd = n & (1 << i); bool rvml = x[i]; check_bool("[]", rstd, rvml, x); } - + { boolvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, !x[i]); } boolvec_t rvml = !x; @@ -842,7 +745,7 @@ struct vecmathlib_test { } { bool rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd &= x[i]; } bool rvml = all(x); @@ -850,39 +753,36 @@ struct vecmathlib_test { } { bool rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd |= x[i]; } bool rvml = any(x); check_bool("any", rstd, rvml, x); } - check_bool - ("ifthen(bool)", - local_ifthen<bool>, - (boolvec_t(*)(boolvec_t,boolvec_t,boolvec_t))vecmathlib::ifthen, - x, BV(false), BV(true)); - check_int("ifthen(int)", - local_ifthen<int_t>, - (intvec_t(*)(boolvec_t,intvec_t,intvec_t))vecmathlib::ifthen, + check_bool( + "ifthen(bool)", local_ifthen<bool>, + (boolvec_t (*)(boolvec_t, boolvec_t, boolvec_t))vecmathlib::ifthen, x, + BV(false), BV(true)); + check_int("ifthen(int)", local_ifthen<int_t>, + (intvec_t (*)(boolvec_t, intvec_t, intvec_t))vecmathlib::ifthen, x, IV(I(1)), IV(I(2))); - check_real("ifthen(real)", - local_ifthen<real_t>, - ((realvec_t(*)(boolvec_t,realvec_t,realvec_t)) - vecmathlib::ifthen), - x, RV(1.0), RV(2.0), R(0.0)); - } - - for (int n=0; n<(1<<realvec_t::size); ++n) { - for (int m=0; m<(1<<realvec_t::size); ++m) { + check_real( + "ifthen(real)", local_ifthen<real_t>, + ((realvec_t (*)(boolvec_t, realvec_t, realvec_t))vecmathlib::ifthen), + x, RV(1.0), RV(2.0), R(0.0)); + } + + for (int n = 0; n < (1 << realvec_t::size); ++n) { + for (int m = 0; m < (1 << realvec_t::size); ++m) { boolvec_t x, y; - for (int i=0; i<realvec_t::size; ++i) { - x.set_elt(i, n & (1<<i)); - y.set_elt(i, m & (1<<i)); + for (int i = 0; i < realvec_t::size; ++i) { + x.set_elt(i, n & (1 << i)); + y.set_elt(i, m & (1 << i)); } - + { boolvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, x[i] && y[i]); } boolvec_t rvml = x && y; @@ -890,7 +790,7 @@ struct vecmathlib_test { } { boolvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, x[i] || y[i]); } boolvec_t rvml = x || y; @@ -898,7 +798,7 @@ struct vecmathlib_test { } { boolvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, x[i] == y[i]); } boolvec_t rvml = x == y; @@ -906,7 +806,7 @@ struct vecmathlib_test { } { boolvec_t rstd; - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { rstd.set_elt(i, x[i] != y[i]); } boolvec_t rvml = x != y; @@ -915,322 +815,374 @@ struct vecmathlib_test { } } } - - - + static bool local_convert_bool(int_t x) { return x; } static int_t local_convert_int(bool x) { return x; } - template<typename T> static T local_pos(T x) { return +x; } - template<typename T> static T local_neg(T x) { return -x; } - template<typename T> static T local_not(T x) { return ~x; } - template<typename T> static T local_add(T x, T y) { return x+y; } - template<typename T> static T local_sub(T x, T y) { return x-y; } - template<typename T> static T local_mul(T x, T y) { return x*y; } - template<typename T> static T local_div(T x, T y) { return x/y; } - template<typename T> static T local_mod(T x, T y) { return x%y; } - template<typename T> static T local_and(T x, T y) { return x&y; } - template<typename T> static T local_or(T x, T y) { return x|y; } - template<typename T> static T local_xor(T x, T y) { return x^y; } - - static int_t local_lsr(int_t x, int_t y) { return uint_t(x)>>uint_t(y); } - template<typename T> static T local_srs(T x, typename T::scalar_t y) - { - return x>>y; + template <typename T> static T local_pos(T x) { return +x; } + template <typename T> static T local_neg(T x) { return -x; } + template <typename T> static T local_not(T x) { return ~x; } + template <typename T> static T local_add(T x, T y) { return x + y; } + template <typename T> static T local_sub(T x, T y) { return x - y; } + template <typename T> static T local_mul(T x, T y) { return x * y; } + template <typename T> static T local_div(T x, T y) { return x / y; } + template <typename T> static T local_mod(T x, T y) { return x % y; } + template <typename T> static T local_and(T x, T y) { return x & y; } + template <typename T> static T local_or(T x, T y) { return x | y; } + template <typename T> static T local_xor(T x, T y) { return x ^ y; } + + static int_t local_lsr(int_t x, int_t y) { return uint_t(x) >> uint_t(y); } + template <typename T> static T local_srs(T x, typename T::scalar_t y) { + return x >> y; } - template<typename T> static T local_sls(T x, typename T::scalar_t y) - { - return x<<y; + template <typename T> static T local_sls(T x, typename T::scalar_t y) { + return x << y; } - template<typename T> static T local_sr(T x, T y) { return x>>y; } - template<typename T> static T local_sl(T x, T y) { return x<<y; } - - template<typename T> static bool local_isignbit(T x) { return x<0; } - template<typename T> static bool local_eq(T x, T y) { return x==y; } - template<typename T> static bool local_ne(T x, T y) { return x!=y; } - template<typename T> static bool local_lt(T x, T y) { return x<y; } - template<typename T> static bool local_le(T x, T y) { return x<=y; } - template<typename T> static bool local_gt(T x, T y) { return x>y; } - template<typename T> static bool local_ge(T x, T y) { return x>=y; } - template<typename T> static boolvec_t local_veq(T x, T y) { return x==y; } - template<typename T> static boolvec_t local_vne(T x, T y) { return x!=y; } - template<typename T> static boolvec_t local_vlt(T x, T y) { return x<y; } - template<typename T> static boolvec_t local_vle(T x, T y) { return x<=y; } - template<typename T> static boolvec_t local_vgt(T x, T y) { return x>y; } - template<typename T> static boolvec_t local_vge(T x, T y) { return x>=y; } - static void test_int() - { + template <typename T> static T local_sr(T x, T y) { return x >> y; } + template <typename T> static T local_sl(T x, T y) { return x << y; } + + template <typename T> static bool local_isignbit(T x) { return x < 0; } + template <typename T> static bool local_eq(T x, T y) { return x == y; } + template <typename T> static bool local_ne(T x, T y) { return x != y; } + template <typename T> static bool local_lt(T x, T y) { return x < y; } + template <typename T> static bool local_le(T x, T y) { return x <= y; } + template <typename T> static bool local_gt(T x, T y) { return x > y; } + template <typename T> static bool local_ge(T x, T y) { return x >= y; } + template <typename T> static boolvec_t local_veq(T x, T y) { return x == y; } + template <typename T> static boolvec_t local_vne(T x, T y) { return x != y; } + template <typename T> static boolvec_t local_vlt(T x, T y) { return x < y; } + template <typename T> static boolvec_t local_vle(T x, T y) { return x <= y; } + template <typename T> static boolvec_t local_vgt(T x, T y) { return x > y; } + template <typename T> static boolvec_t local_vge(T x, T y) { return x >= y; } + static void test_int() { cout << " testing integer operations...\n" << flush; - + intvec_t i0 = intvec_t(I(0)); intvec_t i1 = intvec_t(I(1)); intvec_t iiota = intvec_t::iota(); - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { check_int("0", 0, i0[i]); check_int("1", 1, i1[i]); check_int("iota", i, iiota[i]); } - + i0 = intvec_t(I(1)); i1 = intvec_t(I(0)); - for (int n=0; n<realvec_t::size; ++n) { + for (int n = 0; n < realvec_t::size; ++n) { i0.set_elt(n, 0); i1.set_elt(n, 1); - for (int i=0; i<realvec_t::size; ++i) { - check_bool("set_elt", i<=n ? 0 : 1, i0[i], 0); - check_bool("set_elt", i<=n ? 1 : 0, i1[i], 1); + for (int i = 0; i < realvec_t::size; ++i) { + check_bool("set_elt", i <= n ? 0 : 1, i0[i], 0); + check_bool("set_elt", i <= n ? 1 : 0, i1[i], 1); } } - + const int_t int_min = std::numeric_limits<int_t>::min(); const int_t int_max = std::numeric_limits<int_t>::max(); const int_t values[] = { - 0, 1, 2, 3, -1, -2, -3, - int_min, int_min+1, int_min+2, int_min+3, - int_max, int_max-1, int_max-2, int_max-3, + 0, 1, 2, 3, -1, + -2, -3, int_min, int_min + 1, int_min + 2, + int_min + 3, int_max, int_max - 1, int_max - 2, int_max - 3, }; const int nvalues = sizeof values / sizeof *values; - for (int i=0; i<nvalues*nvalues+2*imax; ++i) { + for (int i = 0; i < nvalues * nvalues + 2 * imax; ++i) { intvec_t x, y; - if (i<nvalues*nvalues) { - x = values[i%nvalues]; - y = values[i/nvalues]; - } else if (i<nvalues*nvalues+imax) { + if (i < nvalues * nvalues) { + x = values[i % nvalues]; + y = values[i / nvalues]; + } else if (i < nvalues * nvalues + imax) { x = random(I(-100), I(+100)); y = random(I(-100), I(+100)); } else { - x = random(int_min/2, int_max/2); - y = random(int_min/2, int_max/2); + x = random(int_min / 2, int_max / 2); + y = random(int_min / 2, int_max / 2); } boolvec_t b = convert_bool(random(I(0), I(1))); - - check_bool<IV>("convert_bool(int)", - local_convert_bool, vecmathlib::convert_bool, x); - check_int<BV>("convert_int(bool)", - local_convert_int, vecmathlib::convert_int, b); - + + check_bool<IV>("convert_bool(int)", local_convert_bool, + vecmathlib::convert_bool, x); + check_int<BV>("convert_int(bool)", local_convert_int, + vecmathlib::convert_int, b); + check_int<IV>("+", local_pos, local_pos, x); check_int<IV>("-", local_neg, local_neg, x); check_int<IV>("~", local_not, local_not, x); - - check_int<IV,IV>("+", local_add, local_add, x, y); - check_int<IV,IV>("-", local_sub, local_sub, x, y); - check_int<IV,IV>("&", local_and, local_and, x, y); - check_int<IV,IV>("|", local_or, local_or, x, y); - check_int<IV,IV>("^", local_xor, local_xor, x, y); - - const int_t bits = 8*sizeof(int_t); - check_int<IV,I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits-1)); - check_int<IV,I>(">>", local_sr, local_srs, x, y[0] & (bits-1)); - check_int<IV,I>("<<", local_sl, local_sls, x, y[0] & (bits-1)); - check_int<IV,IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits-1)); - check_int<IV,IV>(">>", local_sr, local_sr, x, y & IV(bits-1)); - check_int<IV,IV>("<<", local_sl, local_sl, x, y & IV(bits-1)); - + + check_int<IV, IV>("+", local_add, local_add, x, y); + check_int<IV, IV>("-", local_sub, local_sub, x, y); + check_int<IV, IV>("&", local_and, local_and, x, y); + check_int<IV, IV>("|", local_or, local_or, x, y); + check_int<IV, IV>("^", local_xor, local_xor, x, y); + + const int_t bits = 8 * sizeof(int_t); + check_int<IV, I>("lsr", local_lsr, vecmathlib::lsr, x, y[0] & (bits - 1)); + check_int<IV, I>(">>", local_sr, local_srs, x, y[0] & (bits - 1)); + check_int<IV, I>("<<", local_sl, local_sls, x, y[0] & (bits - 1)); + check_int<IV, IV>("lsr", local_lsr, vecmathlib::lsr, x, y & IV(bits - 1)); + check_int<IV, IV>(">>", local_sr, local_sr, x, y & IV(bits - 1)); + check_int<IV, IV>("<<", local_sl, local_sl, x, y & IV(bits - 1)); + check_bool<IV>("isignbit", local_isignbit, vecmathlib::isignbit, x); - check_bool<IV,IV>("==", local_eq, local_veq, x, y); - check_bool<IV,IV>("!=", local_ne, local_vne, x, y); - check_bool<IV,IV>("<", local_lt, local_vlt, x, y); - check_bool<IV,IV>("<=", local_le, local_vle, x, y); - check_bool<IV,IV>(">", local_gt, local_vgt, x, y); - check_bool<IV,IV>(">=", local_ge, local_vge, x, y); + check_bool<IV, IV>("==", local_eq, local_veq, x, y); + check_bool<IV, IV>("!=", local_ne, local_vne, x, y); + check_bool<IV, IV>("<", local_lt, local_vlt, x, y); + check_bool<IV, IV>("<=", local_le, local_vle, x, y); + check_bool<IV, IV>(">", local_gt, local_vgt, x, y); + check_bool<IV, IV>(">=", local_ge, local_vge, x, y); } } - - static void test_real() - { + + static void test_real() { cout << " testing real operations...\n" << flush; - + realvec_t r0 = realvec_t(0.0); realvec_t r1 = realvec_t(1.0); - for (int i=0; i<realvec_t::size; ++i) { + for (int i = 0; i < realvec_t::size; ++i) { check_real("0.0", R(0.0), r0[i]); check_real("1.0", R(1.0), r1[i]); } - + r0 = realvec_t(1.0); r1 = realvec_t(0.0); - for (int n=0; n<realvec_t::size; ++n) { + for (int n = 0; n < realvec_t::size; ++n) { r0.set_elt(n, R(0.0)); r1.set_elt(n, R(1.0)); - for (int i=0; i<realvec_t::size; ++i) { - check_bool("set_elt", i<=n ? R(0.0) : R(1.0), r0[i], R(0.0)); - check_bool("set_elt", i<=n ? R(1.0) : R(0.0), r1[i], R(1.0)); + for (int i = 0; i < realvec_t::size; ++i) { + check_bool("set_elt", i <= n ? R(0.0) : R(1.0), r0[i], R(0.0)); + check_bool("set_elt", i <= n ? R(1.0) : R(0.0), r1[i], R(1.0)); } } - + // barrier realvec_t rcancel = r1; rcancel += RV(R(FP::max() / 2)); rcancel.barrier(); rcancel -= RV(R(FP::max() / 2)); check_real("barrier", R(0.0), rcancel[0]); - + // rounding (break ties to even, or break ties away from zero?) realvec_t rbase = RV(R(1.0)); - rbase += RV(FP::epsilon()/2); + rbase += RV(FP::epsilon() / 2); check_real("flt_rounds", R(1.0), rbase[0]); rbase = RV(R(1.0) + FP::epsilon()); - rbase += RV(FP::epsilon()/2); - check_real("flt_rounds", R(1.0) + 2*FP::epsilon(), rbase[0]); + rbase += RV(FP::epsilon() / 2); + check_real("flt_rounds", R(1.0) + 2 * FP::epsilon(), rbase[0]); } - - static int_t local_bitifthen(int_t x, int_t y, int_t z) - { + + static int_t local_bitifthen(int_t x, int_t y, int_t z) { return (x & y) | (~x & z); } - static int_t local_clz(int_t x) - { + static int_t local_clz(int_t x) { int bits = CHAR_BIT * sizeof(x); int res = 0; - for (; res<bits; ++res) { - if (x & (I(1) << (bits-res-1))) break; + for (; res < bits; ++res) { + if (x & (I(1) << (bits - res - 1))) + break; } return res; } - static int_t local_max(int_t x, int_t y) - { - return std::max(x, y); - } - static int_t local_min(int_t x, int_t y) - { - return std::min(x, y); - } - static int_t local_popcount(int_t x) - { + static int_t local_max(int_t x, int_t y) { return std::max(x, y); } + static int_t local_min(int_t x, int_t y) { return std::min(x, y); } + static int_t local_popcount(int_t x) { int bits = CHAR_BIT * sizeof(x); int res = 0; - for (int d=0; d<bits; ++d) { - if (x & (I(1) << d)) ++res; + for (int d = 0; d < bits; ++d) { + if (x & (I(1) << d)) + ++res; } return res; } - static int_t local_rotate(int_t x, int_t n) - { + static int_t local_rotate(int_t x, int_t n) { int_t mask = CHAR_BIT * sizeof(int_t) - 1; int_t left = x << (n & mask); int_t right = I(U(x) >> U(-n & mask)); return left | right; } - static void test_abs() - { - cout << " testing abs bitifthen clz isignbit max min popcount rotate...\n" << flush; - - for (int i=0; i<imax; ++i) { + static void test_abs() { + cout << " testing abs bitifthen clz isignbit max min popcount rotate...\n" + << flush; + + for (int i = 0; i < imax; ++i) { const intvec_t x = random(I(-1000000), I(+1000000)); const intvec_t y = random(I(-1000000), I(+1000000)); const intvec_t z = random(I(-1000000), I(+1000000)); - + check_int<IV>("abs", std::abs, vecmathlib::abs, x); - check_int<IV,IV,IV>("bitifthen", - local_bitifthen, vecmathlib::bitifthen, x, y, z); + check_int<IV, IV, IV>("bitifthen", local_bitifthen, vecmathlib::bitifthen, + x, y, z); check_int<IV>("clz", local_clz, vecmathlib::clz, x); - check_int<IV,IV>("max", local_max, vecmathlib::max, x, y); - check_int<IV,IV>("min", local_min, vecmathlib::min, x, y); + check_int<IV, IV>("max", local_max, vecmathlib::max, x, y); + check_int<IV, IV>("min", local_min, vecmathlib::min, x, y); check_int<IV>("popcount", local_popcount, vecmathlib::popcount, x); - check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]); - check_int<IV,IV>("rotate", local_rotate, vecmathlib::rotate, x, y); + check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y[0]); + check_int<IV, IV>("rotate", local_rotate, vecmathlib::rotate, x, y); } } - + // Change signature: "int" -> "int_t" - static real_t local_frexp0(real_t x) - { + static real_t local_frexp0(real_t x) { int r; return vml_std::frexp(x, &r); } - static int_t local_frexp1(real_t x) - { - if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max(); - if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min(); + static int_t local_frexp1(real_t x) { + if (vml_std::isinf(x)) + return std::numeric_limits<int_t>::max(); + if (vml_std::isnan(x)) + return std::numeric_limits<int_t>::min(); int r; vml_std::frexp(x, &r); return r; } - static realvec_t local_vfrexp0(realvec_t x) - { + static realvec_t local_vfrexp0(realvec_t x) { intvec_t r; return vecmathlib::frexp(x, &r); } - static intvec_t local_vfrexp1(realvec_t x) - { + static intvec_t local_vfrexp1(realvec_t x) { intvec_t r; vecmathlib::frexp(x, &r); return r; } - static int_t local_ilogb(real_t x) - { - if (x==R(0.0)) return std::numeric_limits<int_t>::min(); - if (vml_std::isinf(x)) return std::numeric_limits<int_t>::max(); - if (vml_std::isnan(x)) return std::numeric_limits<int_t>::min(); + static int_t local_ilogb(real_t x) { + if (x == R(0.0)) + return std::numeric_limits<int_t>::min(); + if (vml_std::isinf(x)) + return std::numeric_limits<int_t>::max(); + if (vml_std::isnan(x)) + return std::numeric_limits<int_t>::min(); return vml_std::ilogb(x); } static real_t local_ldexp(real_t x, int_t n) { return ldexp(x, n); } - static real_t local_mad(real_t x, real_t y, real_t z) { return x*y+z; } - static void test_fabs() - { - cout << " testing + - + - * == != < <= > >= copysign fabs fdim fma fmax fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad nextafter signbit...\n" << flush; - + static real_t local_mad(real_t x, real_t y, real_t z) { return x * y + z; } + static void test_fabs() { + cout << " testing + - + - * == != < <= > >= copysign fabs fdim fma fmax " + "fmin frexp ilogb isfinite isinf isnan isnormal ldexp mad " + "nextafter signbit...\n" + << flush; + const real_t eps = FP::epsilon(); const real_t int_min = R(std::numeric_limits<int_t>::min()); const real_t int_max = R(std::numeric_limits<int_t>::max()); const real_t uint_min = R(std::numeric_limits<uint_t>::min()); const real_t uint_max = R(std::numeric_limits<uint_t>::max()); const real_t values[] = { - R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1), - R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1), - R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps, - R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps, - R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps, - R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps, + R(+0.0), + R(+0.1), + R(+0.9), + R(+1.0), + R(+1.1), + R(-0.0), + R(-0.1), + R(-0.9), + R(-1.0), + R(-1.1), + R(+0.0) + eps, + R(+0.1) + eps, + R(+0.9) + eps, + R(+1.0) + eps, + R(+1.1) + eps, + R(-0.0) + eps, + R(-0.1) + eps, + R(-0.9) + eps, + R(-1.0) + eps, + R(-1.1) + eps, + R(+0.0) - eps, + R(+0.1) - eps, + R(+0.9) - eps, + R(+1.0) - eps, + R(+1.1) - eps, + R(-0.0) - eps, + R(-0.1) - eps, + R(-0.9) - eps, + R(-1.0) - eps, + R(-1.1) - eps, #ifdef VML_HAVE_DENORMALS - +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0), - -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0), + +FP::min(), + +FP::min() * (R(1.0) + eps), + +FP::min() * R(2.0), + -FP::min(), + -FP::min() * (R(1.0) + eps), + -FP::min() * R(2.0), #endif - +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps), - -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps), - +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps), - -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps), + +FP::max(), + +FP::max() * (R(1.0) - eps), + +FP::max() * (R(1.0) - R(2.0) * eps), + -FP::max(), + -FP::max() * (R(1.0) - eps), + -FP::max() * (R(1.0) - R(2.0) * eps), + +R(0.5) * FP::max(), + +R(0.5) * FP::max() * (R(1.0) + eps), + -R(0.5) * FP::max(), + -R(0.5) * FP::max() * (R(1.0) + eps), #ifdef VML_HAVE_INF - +R(1.0/0.0), // +FP::infinity() - -R(1.0/0.0), // -FP::infinity() + +R(1.0 / 0.0), // +FP::infinity() + -R(1.0 / 0.0), // -FP::infinity() #endif #ifdef VML_HAVE_NAN - R(0.0/0.0), // FP::quiet_NaN() + R(0.0 / 0.0), // FP::quiet_NaN() #endif - +int_min, +int_max, +uint_min, +uint_max, - -int_min, -int_max, -uint_min, -uint_max, - +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1), - -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1), - +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1), - -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1), - +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0), - -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0), - +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0), - -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0), - -R(443.9999425), + +int_min, + +int_max, + +uint_min, + +uint_max, + -int_min, + -int_max, + -uint_min, + -uint_max, + +int_min + R(0.1), + +int_max + R(0.1), + +uint_min + R(0.1), + +uint_max + R(0.1), + -int_min + R(0.1), + -int_max + R(0.1), + -uint_min + R(0.1), + -uint_max + R(0.1), + +int_min - R(0.1), + +int_max - R(0.1), + +uint_min - R(0.1), + +uint_max - R(0.1), + -int_min - R(0.1), + -int_max - R(0.1), + -uint_min - R(0.1), + -uint_max - R(0.1), + +int_min + R(1.0), + +int_max + R(1.0), + +uint_min + R(1.0), + +uint_max + R(1.0), + -int_min + R(1.0), + -int_max + R(1.0), + -uint_min + R(1.0), + -uint_max + R(1.0), + +int_min - R(1.0), + +int_max - R(1.0), + +uint_min - R(1.0), + +uint_max - R(1.0), + -int_min - R(1.0), + -int_max - R(1.0), + -uint_min - R(1.0), + -uint_max - R(1.0), + -R(443.9999425), }; const int nvalues = sizeof values / sizeof *values; - - for (int i=0; i<8*nvalues+imax; ++i) { - const realvec_t x = - i<8*nvalues && i&1 ? RV(values[i/8]) : random(R(-10.0), R(+10.0)); - const realvec_t y = - i<8*nvalues && i&2 ? RV(values[i/8]) : random(R(-10.0), R(+10.0)); - const realvec_t z = - i<8*nvalues && i&4 ? RV(values[i/8]) : random(R(-10.0), R(+10.0)); + + for (int i = 0; i < 8 * nvalues + imax; ++i) { + const realvec_t x = i < 8 * nvalues && i & 1 ? RV(values[i / 8]) + : random(R(-10.0), R(+10.0)); + const realvec_t y = i < 8 * nvalues && i & 2 ? RV(values[i / 8]) + : random(R(-10.0), R(+10.0)); + const realvec_t z = i < 8 * nvalues && i & 4 ? RV(values[i / 8]) + : random(R(-10.0), R(+10.0)); const intvec_t n = random(int_t(-10), int_t(+10)); - + check_real<RV>("+", local_pos, local_pos, x, R(0.0)); check_real<RV>("-", local_neg, local_neg, x, R(0.0)); - - check_real<RV,RV>("+", local_add, local_add, x, y, R(0.0)); - check_real<RV,RV>("-", local_sub, local_sub, x, y, R(0.0)); - check_real<RV,RV>("*", local_mul, local_mul, x, y, R(0.0)); - + + check_real<RV, RV>("+", local_add, local_add, x, y, R(0.0)); + check_real<RV, RV>("-", local_sub, local_sub, x, y, R(0.0)); + check_real<RV, RV>("*", local_mul, local_mul, x, y, R(0.0)); + { real_t rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd += x[i]; } real_t rvml = sum(x); @@ -1238,7 +1190,7 @@ struct vecmathlib_test { } { real_t rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd *= x[i]; } real_t rvml = prod(x); @@ -1246,7 +1198,7 @@ struct vecmathlib_test { } { real_t rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd = vml_std::fmax(rstd, x[i]); } real_t rvml = vecmathlib::maxval(x); @@ -1254,34 +1206,33 @@ struct vecmathlib_test { } { real_t rstd = x[0]; - for (int i=1; i<realvec_t::size; ++i) { + for (int i = 1; i < realvec_t::size; ++i) { rstd = vml_std::fmin(rstd, x[i]); } real_t rvml = vecmathlib::minval(x); check_real("minval", rstd, rvml, x, R(0.0)); } - - check_bool<RV,RV>("==", local_eq, local_veq, x, y); - check_bool<RV,RV>("!=", local_ne, local_vne, x, y); - check_bool<RV,RV>("<", local_lt, local_vlt, x, y); - check_bool<RV,RV>("<=", local_le, local_vle, x, y); - check_bool<RV,RV>(">", local_gt, local_vgt, x, y); - check_bool<RV,RV>(">=", local_ge, local_vge, x, y); - - check_real<RV,RV>("copysign", - vml_std::copysign, vecmathlib::copysign, x, y, 0.0); + + check_bool<RV, RV>("==", local_eq, local_veq, x, y); + check_bool<RV, RV>("!=", local_ne, local_vne, x, y); + check_bool<RV, RV>("<", local_lt, local_vlt, x, y); + check_bool<RV, RV>("<=", local_le, local_vle, x, y); + check_bool<RV, RV>(">", local_gt, local_vgt, x, y); + check_bool<RV, RV>(">=", local_ge, local_vge, x, y); + + check_real<RV, RV>("copysign", vml_std::copysign, vecmathlib::copysign, x, + y, 0.0); check_real<RV>("fabs", vml_std::fabs, vecmathlib::fabs, x, 0.0); - check_real<RV,RV>("fdim", - vml_std::fdim, vecmathlib::fdim, x, y, accuracy()); - check_real<RV,RV,RV>("fma", - vml_std::fma, vecmathlib::fma, - x, y, z, R(10.0)*accuracy()); - check_real<RV,RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0); - check_real<RV,RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0); + check_real<RV, RV>("fdim", vml_std::fdim, vecmathlib::fdim, x, y, + accuracy()); + check_real<RV, RV, RV>("fma", vml_std::fma, vecmathlib::fma, x, y, z, + R(10.0) * accuracy()); + check_real<RV, RV>("fmax", vml_std::fmax, vecmathlib::fmax, x, y, 0.0); + check_real<RV, RV>("fmin", vml_std::fmin, vecmathlib::fmin, x, y, 0.0); check_real<RV>("frexp0", local_frexp0, local_vfrexp0, x, 0.0); check_int<RV>("frexp1", local_frexp1, local_vfrexp1, x); - check_int<RV>("ilogb", - local_ilogb, (intvec_t(*)(realvec_t))vecmathlib::ilogb, x); + check_int<RV>("ilogb", local_ilogb, + (intvec_t (*)(realvec_t))vecmathlib::ilogb, x); #if defined VML_HAVE_INF || defined VML_HAVE_NAN check_bool<RV>("isfinite", vml_std::isfinite, vecmathlib::isfinite, x); #endif @@ -1294,91 +1245,162 @@ struct vecmathlib_test { #ifdef VML_HAVE_DENORMALS check_bool<RV>("isnormal", vml_std::isnormal, vecmathlib::isnormal, x); #endif - check_real<RV,I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0); - check_real<RV,IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0); - check_real<RV,RV,RV>("mad", - local_mad, vecmathlib::mad, - x, y, z, R(10.0)*accuracy()); - check_real<RV,RV>("nextafter", - vml_std::nextafter, vecmathlib::nextafter, x, y, 0.0); + check_real<RV, I>("ldexp", local_ldexp, vecmathlib::ldexp, x, n[0], 0.0); + check_real<RV, IV>("ldexp", local_ldexp, vecmathlib::ldexp, x, n, 0.0); + check_real<RV, RV, RV>("mad", local_mad, vecmathlib::mad, x, y, z, + R(10.0) * accuracy()); + check_real<RV, RV>("nextafter", vml_std::nextafter, vecmathlib::nextafter, + x, y, 0.0); check_bool<RV>("signbit", vml_std::signbit, vecmathlib::signbit, x); } } - - static void test_convert() - { - cout << " testing ceil convert_float convert_int floor rint round trunc...\n" + + static void test_convert() { + cout << " testing ceil convert_float convert_int floor rint round " + "trunc...\n" << flush; - + const real_t eps = FP::epsilon(); const real_t int_min = R(std::numeric_limits<int_t>::min()); const real_t int_max = R(std::numeric_limits<int_t>::max()); const real_t uint_min = R(std::numeric_limits<uint_t>::min()); const real_t uint_max = R(std::numeric_limits<uint_t>::max()); - const real_t mantissa_max = (U(1) << (FP::mantissa_bits+1)) - U(1); - const real_t real_max = - (((U(1) << (FP::mantissa_bits+1)) - U(1)) << (FP::exponent_bits-1)) + - (U(1) << (FP::exponent_bits-1)) - U(1); + const real_t mantissa_max = (U(1) << (FP::mantissa_bits + 1)) - U(1); + const real_t real_max = (((U(1) << (FP::mantissa_bits + 1)) - U(1)) + << (FP::exponent_bits - 1)) + + (U(1) << (FP::exponent_bits - 1)) - U(1); const real_t values[] = { - R(+0.0), R(+0.1), R(+0.9), R(+1.0), R(+1.1), - R(-0.0), R(-0.1), R(-0.9), R(-1.0), R(-1.1), - R(+0.0)+eps, R(+0.1)+eps, R(+0.9)+eps, R(+1.0)+eps, R(+1.1)+eps, - R(-0.0)+eps, R(-0.1)+eps, R(-0.9)+eps, R(-1.0)+eps, R(-1.1)+eps, - R(+0.0)-eps, R(+0.1)-eps, R(+0.9)-eps, R(+1.0)-eps, R(+1.1)-eps, - R(-0.0)-eps, R(-0.1)-eps, R(-0.9)-eps, R(-1.0)-eps, R(-1.1)-eps, + R(+0.0), + R(+0.1), + R(+0.9), + R(+1.0), + R(+1.1), + R(-0.0), + R(-0.1), + R(-0.9), + R(-1.0), + R(-1.1), + R(+0.0) + eps, + R(+0.1) + eps, + R(+0.9) + eps, + R(+1.0) + eps, + R(+1.1) + eps, + R(-0.0) + eps, + R(-0.1) + eps, + R(-0.9) + eps, + R(-1.0) + eps, + R(-1.1) + eps, + R(+0.0) - eps, + R(+0.1) - eps, + R(+0.9) - eps, + R(+1.0) - eps, + R(+1.1) - eps, + R(-0.0) - eps, + R(-0.1) - eps, + R(-0.9) - eps, + R(-1.0) - eps, + R(-1.1) - eps, #ifdef VML_HAVE_DENORMALS - +FP::min(), +FP::min()*(R(1.0)+eps), +FP::min()*R(2.0), - -FP::min(), -FP::min()*(R(1.0)+eps), -FP::min()*R(2.0), + +FP::min(), + +FP::min() * (R(1.0) + eps), + +FP::min() * R(2.0), + -FP::min(), + -FP::min() * (R(1.0) + eps), + -FP::min() * R(2.0), #endif - +FP::max(), +FP::max()*(R(1.0)-eps), +FP::max()*(R(1.0)-R(2.0)*eps), - -FP::max(), -FP::max()*(R(1.0)-eps), -FP::max()*(R(1.0)-R(2.0)*eps), - +R(0.5)*FP::max(), +R(0.5)*FP::max()*(R(1.0)+eps), - -R(0.5)*FP::max(), -R(0.5)*FP::max()*(R(1.0)+eps), + +FP::max(), + +FP::max() * (R(1.0) - eps), + +FP::max() * (R(1.0) - R(2.0) * eps), + -FP::max(), + -FP::max() * (R(1.0) - eps), + -FP::max() * (R(1.0) - R(2.0) * eps), + +R(0.5) * FP::max(), + +R(0.5) * FP::max() * (R(1.0) + eps), + -R(0.5) * FP::max(), + -R(0.5) * FP::max() * (R(1.0) + eps), #ifdef VML_HAVE_INF - +R(1.0/0.0), // +FP::infinity() - -R(1.0/0.0), // -FP::infinity() + +R(1.0 / 0.0), // +FP::infinity() + -R(1.0 / 0.0), // -FP::infinity() #endif #ifdef VML_HAVE_NAN - R(0.0/0.0), // FP::quiet_NaN() + R(0.0 / 0.0), // FP::quiet_NaN() #endif - +int_min, +int_max, +uint_min, +uint_max, - -int_min, -int_max, -uint_min, -uint_max, - +int_min+R(0.1), +int_max+R(0.1), +uint_min+R(0.1), +uint_max+R(0.1), - -int_min+R(0.1), -int_max+R(0.1), -uint_min+R(0.1), -uint_max+R(0.1), - +int_min-R(0.1), +int_max-R(0.1), +uint_min-R(0.1), +uint_max-R(0.1), - -int_min-R(0.1), -int_max-R(0.1), -uint_min-R(0.1), -uint_max-R(0.1), - +int_min+R(1.0), +int_max+R(1.0), +uint_min+R(1.0), +uint_max+R(1.0), - -int_min+R(1.0), -int_max+R(1.0), -uint_min+R(1.0), -uint_max+R(1.0), - +int_min-R(1.0), +int_max-R(1.0), +uint_min-R(1.0), +uint_max-R(1.0), - -int_min-R(1.0), -int_max-R(1.0), -uint_min-R(1.0), -uint_max-R(1.0), - +mantissa_max, +mantissa_max-R(1.0), +mantissa_max+R(1.0), - -mantissa_max, -mantissa_max-R(1.0), -mantissa_max+R(1.0), - +real_max, +real_max-R(1.0), +real_max+R(1.0), - -real_max, -real_max-R(1.0), -real_max+R(1.0), - -R(443.9999425), + +int_min, + +int_max, + +uint_min, + +uint_max, + -int_min, + -int_max, + -uint_min, + -uint_max, + +int_min + R(0.1), + +int_max + R(0.1), + +uint_min + R(0.1), + +uint_max + R(0.1), + -int_min + R(0.1), + -int_max + R(0.1), + -uint_min + R(0.1), + -uint_max + R(0.1), + +int_min - R(0.1), + +int_max - R(0.1), + +uint_min - R(0.1), + +uint_max - R(0.1), + -int_min - R(0.1), + -int_max - R(0.1), + -uint_min - R(0.1), + -uint_max - R(0.1), + +int_min + R(1.0), + +int_max + R(1.0), + +uint_min + R(1.0), + +uint_max + R(1.0), + -int_min + R(1.0), + -int_max + R(1.0), + -uint_min + R(1.0), + -uint_max + R(1.0), + +int_min - R(1.0), + +int_max - R(1.0), + +uint_min - R(1.0), + +uint_max - R(1.0), + -int_min - R(1.0), + -int_max - R(1.0), + -uint_min - R(1.0), + -uint_max - R(1.0), + +mantissa_max, + +mantissa_max - R(1.0), + +mantissa_max + R(1.0), + -mantissa_max, + -mantissa_max - R(1.0), + -mantissa_max + R(1.0), + +real_max, + +real_max - R(1.0), + +real_max + R(1.0), + -real_max, + -real_max - R(1.0), + -real_max + R(1.0), + -R(443.9999425), }; const int nvalues = sizeof values / sizeof *values; - - for (int i=0; i<nvalues+imax; ++i) { + + for (int i = 0; i < nvalues + imax; ++i) { const realvec_t x = - i<nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10)); + i < nvalues ? RV(values[i]) : random(R(-1.0e+10), R(+1.0e+10)); const intvec_t n1 = random(int_t(-100), int_t(+100)); - //const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000)); + // const intvec_t n2 = random(int_t(-1000000000), int_t(+1000000000)); const intvec_t n2 = - random(std::numeric_limits<int_t>::min() / 2, // avoid overflow - std::numeric_limits<int_t>::max() / 2); + random(std::numeric_limits<int_t>::min() / 2, // avoid overflow + std::numeric_limits<int_t>::max() / 2); const realvec_t fn1 = vecmathlib::convert_float(n1); const realvec_t fn2 = vecmathlib::convert_float(n2); const realvec_t fn1h = vecmathlib::convert_float(n1) * RV(0.25); const realvec_t fn2h = vecmathlib::convert_float(n2) * RV(0.25); - check_real<IV>("convert_float", - FP::convert_float, vecmathlib::convert_float, n1, R(0.0)); - check_real<IV>("convert_float", - FP::convert_float, vecmathlib::convert_float, n2, R(0.0)); + check_real<IV>("convert_float", FP::convert_float, + vecmathlib::convert_float, n1, R(0.0)); + check_real<IV>("convert_float", FP::convert_float, + vecmathlib::convert_float, n2, R(0.0)); // Note: RV(int_max) > int_max due to rounding if (all(x >= RV(int_min) && x < RV(int_max))) { - check_int<RV>("convert_int", - FP::convert_int, vecmathlib::convert_int, x); + check_int<RV>("convert_int", FP::convert_int, vecmathlib::convert_int, + x); } // TODO: These should all have accuracy R(0.0) instead! check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, x, accuracy()); @@ -1387,218 +1409,213 @@ struct vecmathlib_test { check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn1h, accuracy()); check_real<RV>("ceil", vml_std::ceil, vecmathlib::ceil, fn2h, accuracy()); check_real<RV>("floor", vml_std::floor, vecmathlib::floor, x, accuracy()); - check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1, accuracy()); - check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2, accuracy()); - check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h, accuracy()); - check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h, accuracy()); - // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x, accuracy()); - // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1, accuracy()); - // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2, accuracy()); - // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h, accuracy()); - // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h, accuracy()); + check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1, + accuracy()); + check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2, + accuracy()); + check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn1h, + accuracy()); + check_real<RV>("floor", vml_std::floor, vecmathlib::floor, fn2h, + accuracy()); + // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, x, + // accuracy()); + // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1, + // accuracy()); + // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2, + // accuracy()); + // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn1h, + // accuracy()); + // check_int<RV>("lrint", vml_std::lrint, vecmathlib::rint, fn2h, + // accuracy()); check_real<RV>("rint", vml_std::rint, vecmathlib::rint, x, accuracy()); check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1, accuracy()); check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2, accuracy()); check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn1h, accuracy()); check_real<RV>("rint", vml_std::rint, vecmathlib::rint, fn2h, accuracy()); check_real<RV>("round", vml_std::round, vecmathlib::round, x, accuracy()); - check_real<RV>("round", vml_std::round, vecmathlib::round, fn1, accuracy()); - check_real<RV>("round", vml_std::round, vecmathlib::round, fn2, accuracy()); - check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h, accuracy()); - check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h, accuracy()); + check_real<RV>("round", vml_std::round, vecmathlib::round, fn1, + accuracy()); + check_real<RV>("round", vml_std::round, vecmathlib::round, fn2, + accuracy()); + check_real<RV>("round", vml_std::round, vecmathlib::round, fn1h, + accuracy()); + check_real<RV>("round", vml_std::round, vecmathlib::round, fn2h, + accuracy()); check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, x, accuracy()); - check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1, accuracy()); - check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2, accuracy()); - check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h, accuracy()); - check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h, accuracy()); + check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1, + accuracy()); + check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2, + accuracy()); + check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn1h, + accuracy()); + check_real<RV>("trunc", vml_std::trunc, vecmathlib::trunc, fn2h, + accuracy()); } } - - - - static void test_asin() - { + + static void test_asin() { cout << " testing asin acos atan atan2...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-1.0), R(+1.0)); check_real<RV>("asin", vml_std::asin, vecmathlib::asin, x, accuracy(4)); check_real<RV>("acos", vml_std::acos, vecmathlib::acos, x, accuracy(4)); } - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-100.0), R(+100.0)); const realvec_t y = random(R(-100.0), R(+100.0)); check_real<RV>("atan", vml_std::atan, vecmathlib::atan, x, accuracy(5)); - check_real<RV,RV>("atan2", - vml_std::atan2, vecmathlib::atan2, x, y, accuracy(6)); + check_real<RV, RV>("atan2", vml_std::atan2, vecmathlib::atan2, x, y, + accuracy(6)); } } - - static void test_asinh() - { + + static void test_asinh() { cout << " testing asinh acosh atanh...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-1000.0), R(+1000.0)); - check_real<RV>("asinh", - vml_std::asinh, vecmathlib::asinh, x, accuracy(4)); + check_real<RV>("asinh", vml_std::asinh, vecmathlib::asinh, x, + accuracy(4)); } - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(1.0), R(1000.0)); - check_real<RV>("acosh", - vml_std::acosh, vecmathlib::acosh, x, accuracy(4)); + check_real<RV>("acosh", vml_std::acosh, vecmathlib::acosh, x, + accuracy(4)); } - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-1.0), R(+1.0)); - check_real<RV>("atanh", - vml_std::atanh, vecmathlib::atanh, x, accuracy(5)); + check_real<RV>("atanh", vml_std::atanh, vecmathlib::atanh, x, + accuracy(5)); } } - + static real_t local_exp10(real_t x) { return pow(R(10.0), x); } - static void test_exp() - { + static void test_exp() { cout << " testing exp exp10 exp2 expm1...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-100.0), R(+100.0)); check_real<RV>("exp", vml_std::exp, vecmathlib::exp, x, accuracy(3)); check_real<RV>("exp10", local_exp10, vecmathlib::exp10, x, accuracy(3)); check_real<RV>("exp2", vml_std::exp2, vecmathlib::exp2, x, accuracy(3)); - check_real<RV>("expm1", - vml_std::expm1, vecmathlib::expm1, x, accuracy(3)); + check_real<RV>("expm1", vml_std::expm1, vecmathlib::expm1, x, + accuracy(3)); } } - - static void test_log() - { + + static void test_log() { cout << " testing log log10 log1p log2...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(1.0e-10), R(1.0e+10)); check_real<RV>("log", vml_std::log, vecmathlib::log, x, accuracy(3)); - check_real<RV>("log10", - vml_std::log10, vecmathlib::log10, x, accuracy(3)); - check_real<RV>("log1p", - vml_std::log1p, vecmathlib::log1p, x, accuracy(2)); + check_real<RV>("log10", vml_std::log10, vecmathlib::log10, x, + accuracy(3)); + check_real<RV>("log1p", vml_std::log1p, vecmathlib::log1p, x, + accuracy(2)); check_real<RV>("log2", vml_std::log2, vecmathlib::log2, x, accuracy(3)); } } - - static void test_pow() - { + + static void test_pow() { cout << " testing pow...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(0.001), R(1000.0)); const realvec_t y = random(R(-10.0), R(+10.0)); const realvec_t ya = fabs(y); const intvec_t n = random(I(-10), I(+10)); const realvec_t fn = vecmathlib::convert_float(n); - check_real<RV,RV>("pow(0,y)", - vml_std::pow, vecmathlib::pow, RV(0.0), ya, - accuracy(16)); - check_real<RV,RV>("pow(x,0)", - vml_std::pow, vecmathlib::pow, x, RV(0.0), - accuracy(16)); + check_real<RV, RV>("pow(0,y)", vml_std::pow, vecmathlib::pow, RV(0.0), ya, + accuracy(16)); + check_real<RV, RV>("pow(x,0)", vml_std::pow, vecmathlib::pow, x, RV(0.0), + accuracy(16)); // just to check check_real<RV>("log(x)", vml_std::log, vecmathlib::log, x, accuracy(3)); - check_real<RV,RV>("pow(x,y)", - vml_std::pow, vecmathlib::pow, x, y, accuracy(16)); - check_real<RV,RV>("pow(-x,n)", - vml_std::pow, vecmathlib::pow, -x, fn, accuracy(16)); + check_real<RV, RV>("pow(x,y)", vml_std::pow, vecmathlib::pow, x, y, + accuracy(16)); + check_real<RV, RV>("pow(-x,n)", vml_std::pow, vecmathlib::pow, -x, fn, + accuracy(16)); } } - - static real_t local_rcp(real_t x) { return R(1.0)/x; } - static void test_rcp() - { + + static real_t local_rcp(real_t x) { return R(1.0) / x; } + static void test_rcp() { cout << " testing / fmod rcp remainder...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-100.0), R(+100.0)); const realvec_t y = random(R(-100.0), R(+100.0)); const intvec_t n = random(I(-100), I(+100)); const intvec_t m = random(I(-100), I(+100)); const realvec_t fn = vecmathlib::convert_float(n); - const realvec_t fm = vecmathlib::convert_float - (m + vecmathlib::convert_int(m == intvec_t(I(0)))); - check_real<RV,RV>("/", local_div, local_div, x, y, accuracy()); + const realvec_t fm = vecmathlib::convert_float( + m + vecmathlib::convert_int(m == intvec_t(I(0)))); + check_real<RV, RV>("/", local_div, local_div, x, y, accuracy()); check_real<RV>("rcp", local_rcp, vecmathlib::rcp, x, accuracy()); - check_real<RV,RV>("fmod(x,y)", - vml_std::fmod, vecmathlib::fmod, x, y, - 2.0*accuracy(), y); - check_real<RV,RV>("fmod(x,m)", - vml_std::fmod, vecmathlib::fmod, x, fm, - 2.0*accuracy(), fm); - check_real<RV,RV>("fmod(n,y)", - vml_std::fmod, vecmathlib::fmod, fn, y, - 2.0*accuracy(), y); - check_real<RV,RV>("remainder(x,y)", - vml_std::remainder, vecmathlib::remainder, - x, y, R(2.0)*accuracy(), y); - check_real<RV,RV>("remainder(x,m)", - vml_std::remainder, vecmathlib::remainder, - x, fm, R(2.0)*accuracy(), fm); - check_real<RV,RV>("remainder(n,y)", - vml_std::remainder, vecmathlib::remainder, - fn, y, R(2.0)*accuracy(), y); + check_real<RV, RV>("fmod(x,y)", vml_std::fmod, vecmathlib::fmod, x, y, + 2.0 * accuracy(), y); + check_real<RV, RV>("fmod(x,m)", vml_std::fmod, vecmathlib::fmod, x, fm, + 2.0 * accuracy(), fm); + check_real<RV, RV>("fmod(n,y)", vml_std::fmod, vecmathlib::fmod, fn, y, + 2.0 * accuracy(), y); + check_real<RV, RV>("remainder(x,y)", vml_std::remainder, + vecmathlib::remainder, x, y, R(2.0) * accuracy(), y); + check_real<RV, RV>("remainder(x,m)", vml_std::remainder, + vecmathlib::remainder, x, fm, R(2.0) * accuracy(), fm); + check_real<RV, RV>("remainder(n,y)", vml_std::remainder, + vecmathlib::remainder, fn, y, R(2.0) * accuracy(), y); } } - - static void test_sin() - { + + static void test_sin() { cout << " testing cos sin tan...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-10.0), R(+10.0)); check_real<RV>("sin", vml_std::sin, vecmathlib::sin, x, accuracy(4)); check_real<RV>("cos", vml_std::cos, vecmathlib::cos, x, accuracy(4)); } - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x0 = random(R(-1.55), R(+1.55)); const intvec_t n = random(I(-10), I(+10)); const realvec_t x = x0 + vecmathlib::convert_float(n) * RV(M_PI); // tan loses accuracy near pi/2 // (by definition, not by implementation?) - check_real<RV>("tan", - vml_std::tan, vecmathlib::tan, x, R(20.0)*accuracy(5)); + check_real<RV>("tan", vml_std::tan, vecmathlib::tan, x, + R(20.0) * accuracy(5)); } } - - static void test_sinh() - { + + static void test_sinh() { cout << " testing cosh sinh tanh...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(-10.0), R(+10.0)); check_real<RV>("sinh", vml_std::sinh, vecmathlib::sinh, x, accuracy(4)); check_real<RV>("cosh", vml_std::cosh, vecmathlib::cosh, x, accuracy(4)); check_real<RV>("tanh", vml_std::tanh, vecmathlib::tanh, x, accuracy(5)); } } - - static real_t local_rsqrt(real_t x) { return R(1.0)/sqrt(x); } - static void test_sqrt() - { + + static real_t local_rsqrt(real_t x) { return R(1.0) / sqrt(x); } + static void test_sqrt() { cout << " testing cbrt hypot rsqrt sqrt...\n" << flush; - for (int i=0; i<imax; ++i) { + for (int i = 0; i < imax; ++i) { const realvec_t x = random(R(1.0e-3), R(1.0e+3)); const realvec_t y = random(-R(1.0e+3), R(1.0e+3)); const realvec_t z = random(-R(1.0e+3), R(1.0e+3)); check_real<RV>("cbrt", vml_std::cbrt, vecmathlib::cbrt, x, accuracy()); - check_real<RV,RV>("hypot", - vml_std::hypot, vecmathlib::hypot, y, z, accuracy()); + check_real<RV, RV>("hypot", vml_std::hypot, vecmathlib::hypot, y, z, + accuracy()); check_real<RV>("rsqrt", local_rsqrt, vecmathlib::rsqrt, x, accuracy()); check_real<RV>("sqrt", vml_std::sqrt, vecmathlib::sqrt, x, accuracy()); } } - - - - static void test() - { + + static void test() { cout << "\n" << "Testing math functions for type " << realvec_t::name() << ":\n"; - + test_bool(); test_int(); test_real(); - + test_mem(); - + // Test "basic" functions first test_abs(); test_fabs(); @@ -1615,90 +1632,86 @@ struct vecmathlib_test { } }; - - -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { using namespace vecmathlib; cout << "Testing math functions:\n" - << "[" VECMATHLIB_CONFIGURATION "]\n" - << flush; - - vecmathlib_test<realpseudovec<float,1> >::test(); + << "[" VECMATHLIB_CONFIGURATION "]\n" << flush; + + vecmathlib_test<realpseudovec<float, 1>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<float,1> >::test(); + vecmathlib_test<realbuiltinvec<float, 1>>::test(); #endif - vecmathlib_test<realtestvec<float,1> >::test(); + vecmathlib_test<realtestvec<float, 1>>::test(); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1 - vecmathlib_test<realvec<float,1> >::test(); + vecmathlib_test<realvec<float, 1>>::test(); #endif - vecmathlib_test<realpseudovec<float,2> >::test(); + vecmathlib_test<realpseudovec<float, 2>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<float,2> >::test(); + vecmathlib_test<realbuiltinvec<float, 2>>::test(); #endif - vecmathlib_test<realtestvec<float,2> >::test(); + vecmathlib_test<realtestvec<float, 2>>::test(); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_2 - vecmathlib_test<realvec<float,2> >::test(); + vecmathlib_test<realvec<float, 2>>::test(); #endif - vecmathlib_test<realpseudovec<float,4> >::test(); + vecmathlib_test<realpseudovec<float, 4>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<float,4> >::test(); + vecmathlib_test<realbuiltinvec<float, 4>>::test(); #endif - vecmathlib_test<realtestvec<float,4> >::test(); + vecmathlib_test<realtestvec<float, 4>>::test(); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_4 - vecmathlib_test<realvec<float,4> >::test(); + vecmathlib_test<realvec<float, 4>>::test(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8 - vecmathlib_test<realpseudovec<float,8> >::test(); + vecmathlib_test<realpseudovec<float, 8>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<float,8> >::test(); + vecmathlib_test<realbuiltinvec<float, 8>>::test(); #endif - vecmathlib_test<realtestvec<float,8> >::test(); - vecmathlib_test<realvec<float,8> >::test(); + vecmathlib_test<realtestvec<float, 8>>::test(); + vecmathlib_test<realvec<float, 8>>::test(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_16 - vecmathlib_test<realpseudovec<float,16> >::test(); + vecmathlib_test<realpseudovec<float, 16>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<float,16> >::test(); + vecmathlib_test<realbuiltinvec<float, 16>>::test(); #endif - vecmathlib_test<realtestvec<float,16> >::test(); - vecmathlib_test<realvec<float,16> >::test(); + vecmathlib_test<realtestvec<float, 16>>::test(); + vecmathlib_test<realvec<float, 16>>::test(); #endif - - vecmathlib_test<realpseudovec<double,1> >::test(); + + vecmathlib_test<realpseudovec<double, 1>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<double,1> >::test(); + vecmathlib_test<realbuiltinvec<double, 1>>::test(); #endif - vecmathlib_test<realtestvec<double,1> >::test(); + vecmathlib_test<realtestvec<double, 1>>::test(); #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1 - vecmathlib_test<realvec<double,1> >::test(); + vecmathlib_test<realvec<double, 1>>::test(); #endif - vecmathlib_test<realpseudovec<double,2> >::test(); + vecmathlib_test<realpseudovec<double, 2>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<double,2> >::test(); + vecmathlib_test<realbuiltinvec<double, 2>>::test(); #endif - vecmathlib_test<realtestvec<double,2> >::test(); + vecmathlib_test<realtestvec<double, 2>>::test(); #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2 - vecmathlib_test<realvec<double,2> >::test(); + vecmathlib_test<realvec<double, 2>>::test(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4 - vecmathlib_test<realpseudovec<double,4> >::test(); + vecmathlib_test<realpseudovec<double, 4>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<double,4> >::test(); + vecmathlib_test<realbuiltinvec<double, 4>>::test(); #endif - vecmathlib_test<realtestvec<double,4> >::test(); - vecmathlib_test<realvec<double,4> >::test(); + vecmathlib_test<realtestvec<double, 4>>::test(); + vecmathlib_test<realvec<double, 4>>::test(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8 - vecmathlib_test<realpseudovec<double,8> >::test(); + vecmathlib_test<realpseudovec<double, 8>>::test(); #ifdef __clang__ - vecmathlib_test<realbuiltinvec<double,8> >::test(); + vecmathlib_test<realbuiltinvec<double, 8>>::test(); #endif - vecmathlib_test<realtestvec<double,8> >::test(); - vecmathlib_test<realvec<double,8> >::test(); + vecmathlib_test<realtestvec<double, 8>>::test(); + vecmathlib_test<realvec<double, 8>>::test(); #endif - + cout << "\n"; if (num_errors == 0) { cout << "SUCCESS"; @@ -1706,6 +1719,6 @@ int main(int argc, char** argv) cout << "FAILURE"; } cout << ": " << num_errors << " errors found\n" << flush; - + return num_errors == 0 ? 0 : 1; } diff --git a/vec_altivec_float4.h b/vec_altivec_float4.h index 14e0308..55530b4 100644 --- a/vec_altivec_float4.h +++ b/vec_altivec_float4.h @@ -13,647 +13,566 @@ #include <altivec.h> #if defined __clang__ -# define __vector vector -# define __pixel pixel -# define __bool bool +#define __vector vector +#define __pixel pixel +#define __bool bool #elif defined __gcc__ -# undef vector -# undef pixel -# undef bool +#undef vector +#undef pixel +#undef bool #elif defined __xlC__ -# define __bool bool +#define __bool bool #else -# error "Unknown compiler" +#error "Unknown compiler" #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_4 - template<> struct boolvec<float,4>; - template<> struct intvec<float,4>; - template<> struct realvec<float,4>; - - - - template<> - struct boolvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef bool scalar_t; - typedef __vector __bool int bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v((bvector_t)vec_splats(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return vec_nor(v, v); } - - boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } - boolvec operator||(boolvec x) const { return vec_or(v, x.v); } - // boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator==(boolvec x) const; // defined after intvec - boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } - - bool all() const { return vec_all_ne(v, BV(false).v); } - bool any() const { return vec_any_ne(v, BV(false).v); } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef int_t scalar_t; - typedef __vector signed int ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats(a)) {} - intvec(int_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - static intvec iota() { return (__vector signed int){0, 1, 2, 3}; } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - // Vector casts do not change the bit battern - boolvec_t as_bool() const { return (__vector __bool int)v; } - boolvec_t convert_bool() const { return *this != IV(0); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return *this; } - intvec operator-() const - { +template <> struct boolvec<float, 4>; +template <> struct intvec<float, 4>; +template <> struct realvec<float, 4>; + +template <> struct boolvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef bool scalar_t; + typedef __vector __bool int bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values are -1, false values are 0 + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v((bvector_t)vec_splats(from_bool(a))) {} + boolvec(bool const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return vec_nor(v, v); } + + boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } + boolvec operator||(boolvec x) const { return vec_or(v, x.v); } + // boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator==(boolvec x) const; // defined after intvec + boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } + + bool all() const { return vec_all_ne(v, BV(false).v); } + bool any() const { return vec_any_ne(v, BV(false).v); } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef int_t scalar_t; + typedef __vector signed int ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(vec_splats(a)) {} + intvec(int_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + static intvec iota() { return (__vector signed int){0, 1, 2, 3}; } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + // Vector casts do not change the bit battern + boolvec_t as_bool() const { return (__vector __bool int)v; } + boolvec_t convert_bool() const { return *this != IV(0); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec operator+() const { return *this; } + intvec operator-() const { #if defined __xlC_ - return vec_neg(v); + return vec_neg(v); #else - // vec_neg does not exist in clang - return IV(I(0)) - *this; + // vec_neg does not exist in clang + return IV(I(0)) - *this; #endif + } + + intvec operator+(intvec x) const { return vec_add(v, x.v); } + intvec operator-(intvec x) const { return vec_sub(v, x.v); } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + + intvec operator~() const { return vec_nor(v, v); } + + intvec operator&(intvec x) const { return vec_and(v, x.v); } + intvec operator|(intvec x) const { return vec_or(v, x.v); } + intvec operator^(intvec x) const { return vec_xor(v, x.v); } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { return lsr(IV(n)); } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + return vec_sr(v, (__vector unsigned int)n.v); + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { + return vec_sra(v, (__vector unsigned int)n.v); + } + intvec operator<<(intvec n) const { + return vec_sl(v, (__vector unsigned int)n.v); + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec const &x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(intvec const &x) const { return !(*this == x); } + boolvec_t operator<(intvec const &x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(intvec const &x) const { return !(*this > x); } + boolvec_t operator>(intvec const &x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(intvec const &x) const { return !(*this < x); } + + intvec_t abs() const { return vec_abs(v); } + boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); } + intvec_t max(intvec_t x) const { return vec_max(v, x.v); } + intvec_t min(intvec_t x) const { return vec_min(v, x.v); } +}; + +template <> struct realvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef real_t scalar_t; + typedef __vector float vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<Altivec:4*float>"; } + void barrier() { __asm__("" : "+v"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(vec_splats(a)) {} + realvec(real_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_ld(0, p); + } + static realvec_t loadu(real_t const *p) { + realvec_t v0 = vec_ld(0, p); + realvec_t v1 = vec_ld(15, p); + return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - - intvec operator+(intvec x) const { return vec_add(v, x.v); } - intvec operator-(intvec x) const { return vec_sub(v, x.v); } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return vec_nor(v, v); } - - intvec operator&(intvec x) const { return vec_and(v, x.v); } - intvec operator|(intvec x) const { return vec_or(v, x.v); } - intvec operator^(intvec x) const { return vec_xor(v, x.v); } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const { return lsr(IV(n)); } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - return vec_sr(v, (__vector unsigned int)n.v); - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - return vec_sra(v, (__vector unsigned int)n.v); - } - intvec operator<<(intvec n) const - { - return vec_sl(v, (__vector unsigned int)n.v); - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(intvec const& x) const { return !(*this == x); } - boolvec_t operator<(intvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(intvec const& x) const { return !(*this > x); } - boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(intvec const& x) const { return !(*this < x); } - - intvec_t abs() const { return vec_abs(v); } - boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); } - intvec_t max(intvec_t x) const { return vec_max(v, x.v); } - intvec_t min(intvec_t x) const { return vec_min(v, x.v); } - }; - - - - template<> - struct realvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef real_t scalar_t; - typedef __vector float vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<Altivec:4*float>"; } - void barrier() { __asm__("": "+v"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(real_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_ld(0, p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t v0 = vec_ld(0, p); - realvec_t v1 = vec_ld(15, p); - return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vec_st(v, 0, p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - p[2] = (*this)[2]; - p[3] = (*this)[3]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + vec_st(v, 0, p); + } + void storeu(real_t *p) const { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> + // for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + p[2] = (*this)[2]; + p[3] = (*this)[3]; + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // Use vec_ste? + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // Use vec_ste? + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - - - - intvec_t as_int() const { return (__vector signed int) v; } - intvec_t convert_int() const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return (__vector signed int)v; } + intvec_t convert_int() const { #if defined __xlC__ - return vec_cts(v, 0); + return vec_cts(v, 0); #else - // vec_cts leads to an ICE in clang - return MF::vml_convert_int(*this); + // vec_cts leads to an ICE in clang + return MF::vml_convert_int(*this); #endif - } - - - - realvec operator+() const { return *this; } - realvec operator-() const - { + } + + realvec operator+() const { return *this; } + realvec operator-() const { #if defined __xlC_ - return vec_neg(v); + return vec_neg(v); #else - // vec_neg does not exist in clang - return RV(0.0) - *this; + // vec_neg does not exist in clang + return RV(0.0) - *this; #endif - } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { + } + + realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { #if defined __xlC__ - return vec_mul(v, x.v); + return vec_mul(v, x.v); #else - // vec_mul does not exist in clang - return vec_madd(v, x.v, RV(0.0).v); + // vec_mul does not exist in clang + return vec_madd(v, x.v, RV(0.0).v); #endif - } - realvec operator/(realvec x) const { + } + realvec operator/(realvec x) const { #if defined __xlC__ - return vec_div(v, x.v); + return vec_div(v, x.v); #else - // vec_div does not exist in clang - return *this * x.rcp(); + // vec_div does not exist in clang + return *this * x.rcp(); #endif - } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const - { - return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), - vml_std::fmax((*this)[2], (*this)[3])); - } - real_t minval() const - { - return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), - vml_std::fmin((*this)[2], (*this)[3])); - } - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - } - - - - boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return ! (*this == x); } - boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } - realvec fmax(realvec y) const { return vec_max(v, y.v); } - realvec fmin(realvec y) const { return vec_min(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return vec_madd(v, y.v, z.v); - } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = vec_re(v); // this is only an approximation - // TODO: use fma - // Note: don't rewrite this expression, this may introduce - // cancellation errors - r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp) - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const { return vec_round(v); /* sic! */ } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const - { - realvec x = *this; - realvec r = vec_rsqrte(x.v); // this is only an approximation - // TODO: use fma - // one Newton iteration (see vml_rsqrt) - r += RV(0.5)*r * (RV(1.0) - x * r*r); - return r; - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { + } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { + return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), + vml_std::fmax((*this)[2], (*this)[3])); + } + real_t minval() const { + return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), + vml_std::fmin((*this)[2], (*this)[3])); + } + real_t prod() const { + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const { + return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + } + + boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec const &x) const { return !(*this == x); } + boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); } + boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); } + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return vec_max(v, y.v); } + realvec fmin(realvec y) const { return vec_min(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return vec_madd(v, y.v, z.v); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { + realvec x = *this; + realvec r = vec_re(v); // this is only an approximation + // TODO: use fma + // Note: don't rewrite this expression, this may introduce + // cancellation errors + r += r * (RV(1.0) - x * r); // one Newton iteration (see vml_rcp) + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return vec_round(v); /* sic! */ } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { + realvec x = *this; + realvec r = vec_rsqrte(x.v); // this is only an approximation + // TODO: use fma + // one Newton iteration (see vml_rsqrt) + r += RV(0.5) * r * (RV(1.0) - x * r * r); + return r; + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { #if defined __xlC__ - return vec_sqrt(v); + return vec_sqrt(v); #else - return *this * rsqrt(); + return *this * rsqrt(); #endif - } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline intvec<float,4> boolvec<float,4>::as_int() const - { - return (__vector signed int) v; - } - - inline intvec<float,4> boolvec<float,4>::convert_int() const - { - return -(__vector signed int)v; - } - - inline boolvec<float,4> boolvec<float,4>::operator==(boolvec_t x) const - { - return as_int() == x.as_int(); - } - - inline - boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - inline - intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - inline - realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline realvec<float,4> intvec<float,4>::as_float() const - { - return (__vector float)v; - } - - inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<float,4> intvec<float,4>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<float,4> intvec<float,4>::convert_float() const - { + } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return vec_trunc(v); } +}; + +// boolvec definitions + +inline intvec<float, 4> boolvec<float, 4>::as_int() const { + return (__vector signed int)v; +} + +inline intvec<float, 4> boolvec<float, 4>::convert_int() const { + return -(__vector signed int)v; +} + +inline boolvec<float, 4> boolvec<float, 4>::operator==(boolvec_t x) const { + return as_int() == x.as_int(); +} + +inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x, + intvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x, + realvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +// intvec definitions + +inline realvec<float, 4> intvec<float, 4>::as_float() const { + return (__vector float)v; +} + +inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<float, 4> intvec<float, 4>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<float, 4> intvec<float, 4>::convert_float() const { #if defined __xlC__ - return vec_ctf(v, 0); + return vec_ctf(v, 0); #else - // vec_ctf leads to an ICE in clang - return MF::vml_convert_float(*this); + // vec_ctf leads to an ICE in clang + return MF::vml_convert_float(*this); #endif - } - - inline intvec<float,4> intvec<float,4>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<float,4> intvec<float,4>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +} + +inline intvec<float, 4> intvec<float, 4>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_ALTIVEC_FLOAT4_H +#endif // #ifndef VEC_ALTIVEC_FLOAT4_H diff --git a/vec_avx_double4.h b/vec_avx_double4.h index 1352712..f01e74c 100644 --- a/vec_avx_double4.h +++ b/vec_avx_double4.h @@ -12,288 +12,244 @@ // AVX intrinsics #include <immintrin.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_4 - template<> struct boolvec<double,4>; - template<> struct intvec<double,4>; - template<> struct realvec<double,4>; - - - - template<> - struct boolvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef bool scalar_t; - typedef __m256d bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec_t& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); } - - boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); } - boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); } - boolvec_t operator==(boolvec_t x) const { return !(*this!=x); } - boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; - return ! (! *this).any(); - } - bool any() const - { - // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; - return ! bool(_mm256_testz_pd(v, v)); - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi64x(a)) {} - intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {} - static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return _mm256_castsi256_pd(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true +template <> struct boolvec<double, 4>; +template <> struct intvec<double, 4>; +template <> struct realvec<double, 4>; + +template <> struct boolvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef bool scalar_t; + typedef __m256d bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {} + boolvec(bool const *as) + : v(_mm256_castsi256_pd( + _mm256_set_epi64x(from_bool(as[3]), from_bool(as[2]), + from_bool(as[1]), from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec_t &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return _mm256_xor_pd(boolvec(true), v); } + + boolvec_t operator&&(boolvec_t x) const { return _mm256_and_pd(v, x.v); } + boolvec_t operator||(boolvec_t x) const { return _mm256_or_pd(v, x.v); } + boolvec_t operator==(boolvec_t x) const { return !(*this != x); } + boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_pd(v, x.v); } + + bool all() const { + // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; + return !(!*this).any(); + } + bool any() const { + // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; + return !bool(_mm256_testz_pd(v, v)); + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm256_set1_epi64x(a)) {} + intvec(int_t const *as) : v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {} + static intvec_t iota() { return _mm256_set_epi64x(3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return _mm256_castsi256_pd(v); } + boolvec_t convert_bool() const { +// Result: convert_bool(0)=false, convert_bool(else)=true #ifdef __AVX2__ - return *this != IV(I(0)); + return *this != IV(I(0)); #else - // There is no intrinsic to compare to zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec_t x = *this; - // We know that boolvec_t values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + // There is no intrinsic to compare to zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec_t x = *this; + // We know that boolvec_t values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); #endif - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec_t operator+() const { return *this; } - intvec_t operator-() const { return IV(I(0)) - *this; } - - intvec_t operator+(intvec_t x) const - { + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec_t operator+() const { return *this; } + intvec_t operator-() const { return IV(I(0)) - *this; } + + intvec_t operator+(intvec_t x) const { #ifdef __AVX2__ - return _mm256_add_epi64(v, x.v); + return _mm256_add_epi64(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi64(vlo, xvlo); - vhi = _mm_add_epi64(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi64(vlo, xvlo); + vhi = _mm_add_epi64(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t operator-(intvec_t x) const - { + } + intvec_t operator-(intvec_t x) const { #ifdef __AVX2__ - return _mm256_sub_epi64(v, x.v); + return _mm256_sub_epi64(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi64(vlo, xvlo); - vhi = _mm_sub_epi64(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi64(vlo, xvlo); + vhi = _mm_sub_epi64(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - - - - intvec_t operator~() const { return IV(~U(0)) ^ *this; } - - intvec_t operator&(intvec_t x) const - { + } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + + intvec_t operator~() const { return IV(~U(0)) ^ *this; } + + intvec_t operator&(intvec_t x) const { #ifdef __AVX2__ - return _mm256_and_si256(v, x.v); + return _mm256_and_si256(v, x.v); #else - return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); + return _mm256_castpd_si256( + _mm256_and_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); #endif - } - intvec_t operator|(intvec_t x) const - { + } + intvec_t operator|(intvec_t x) const { #ifdef __AVX2__ - return _mm256_or_si256(v, x.v); + return _mm256_or_si256(v, x.v); #else - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); + return _mm256_castpd_si256( + _mm256_or_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); #endif - } - intvec_t operator^(intvec_t x) const - { + } + intvec_t operator^(intvec_t x) const { #ifdef __AVX2__ - return _mm256_xor_si256(v, x.v); + return _mm256_xor_si256(v, x.v); #else - return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); + return _mm256_castpd_si256( + _mm256_xor_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); #endif - } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const - { + } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { #ifdef __AVX2__ - return _mm256_srli_epi64(v, n); + return _mm256_srli_epi64(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi64(vlo, n); - vhi = _mm_srli_epi64(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi64(vlo, n); + vhi = _mm_srli_epi64(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const - { + } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { #ifdef __AVX2__ - // There is no _mm256_srai_epi64. To emulate it, add 0x80000000 - // before shifting, and subtract the shifted 0x80000000 after - // shifting - intvec_t offset = U(1) << (bits-1); - return (*this + offset).lsr(n) - offset.lsr(n); + // There is no _mm256_srai_epi64. To emulate it, add 0x80000000 + // before shifting, and subtract the shifted 0x80000000 after + // shifting + intvec_t offset = U(1) << (bits - 1); + return (*this + offset).lsr(n) - offset.lsr(n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - // There is no _mm_srai_epi64. To emulate it, add 0x80000000 - // before shifting, and subtract the shifted 0x80000000 after - // shifting + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); +// There is no _mm_srai_epi64. To emulate it, add 0x80000000 +// before shifting, and subtract the shifted 0x80000000 after +// shifting #if 0 __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0), _mm_srli_epi64(vlo, 63)); @@ -306,532 +262,445 @@ namespace vecmathlib { vlo = _mm_xor_si128(signmask01, vlo); vhi = _mm_xor_si128(signmask23, vhi); #else - // Convert signed to unsiged - vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1))); - vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1))); - // Shift - vlo = _mm_srli_epi64(vlo, n); - vhi = _mm_srli_epi64(vhi, n); - // Undo conversion - vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n))); - vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n))); + // Convert signed to unsiged + vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1))); + vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1))); + // Shift + vlo = _mm_srli_epi64(vlo, n); + vhi = _mm_srli_epi64(vhi, n); + // Undo conversion + vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits - 1 - n))); + vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits - 1 - n))); #endif - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t operator<<(int_t n) const - { + } + intvec_t operator<<(int_t n) const { #ifdef __AVX2__ - return _mm256_slli_epi64(v, n); + return _mm256_slli_epi64(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi64(vlo, n); - vhi = _mm_slli_epi64(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi64(vlo, n); + vhi = _mm_slli_epi64(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { + } + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { #ifdef __AVX2__ - return _mm256_srlv_epi64(v, n.v); + return _mm256_srlv_epi64(v, n.v); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const - { + return r; +#endif + } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { #ifdef __AVX2__ - // See operator>> above - intvec_t offset = U(1) << (bits-1); - return (*this + offset).lsr(n) - offset.lsr(n); + // See operator>> above + intvec_t offset = U(1) << (bits - 1); + return (*this + offset).lsr(n) - offset.lsr(n); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - intvec_t operator<<(intvec_t n) const - { + return r; +#endif + } + intvec_t operator<<(intvec_t n) const { #ifdef __AVX2__ - return _mm256_sllv_epi64(v, n.v); + return _mm256_sllv_epi64(v, n.v); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec_t const& x) const - { + return r; +#endif + } + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec_t const &x) const { #ifdef __AVX2__ - return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v)); + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v)); #else - return ! (*this != x); + return !(*this != x); #endif - } - boolvec_t operator!=(intvec_t const& x) const - { + } + boolvec_t operator!=(intvec_t const &x) const { #ifdef __AVX2__ - return ! (*this == x); + return !(*this == x); #else - return (*this ^ x).convert_bool(); + return (*this ^ x).convert_bool(); #endif - } - boolvec_t operator<(intvec_t const& x) const - { + } + boolvec_t operator<(intvec_t const &x) const { #ifdef __AVX2__ - return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v)); + return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v)); #else - // return (*this - x).as_bool(); - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; -#endif - } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); + // return (*this - x).as_bool(); + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef real_t scalar_t; - typedef __m256d vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { + return r; +#endif + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef real_t scalar_t; + typedef __m256d vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { #ifdef __AVX2__ - return "<AVX2:4*double>"; + return "<AVX2:4*double>"; #else - return "<AVX:4*double>"; + return "<AVX:4*double>"; #endif + } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm256_set1_pd(a)) {} + realvec(real_t const *as) : v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_pd(p); + } + static realvec_t loadu(real_t const *p) { return _mm256_loadu_pd(p); } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_pd(a)) {} - realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_pd(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_pd(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_pd(p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_pd(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - _mm256_maskstore_pd(p, m.m.as_int(), v); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - for (int d=0; d<size; ++d) { - if (m.m[d]) p[d] = (*this)[d]; - } - } + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_pd(p, v); + } + void storeu(real_t *p) const { return _mm256_storeu_pd(p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_pd(p, m.m.as_int(), v); } - - - - intvec_t as_int() const { return _mm256_castpd_si256(v); } - intvec_t convert_int() const - { - intvec_t r; - for (int d=0; d<size; ++d) { - r.set_elt(d, floatprops::convert_int((*this)[d])); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + for (int d = 0; d < size; ++d) { + if (m.m[d]) + p[d] = (*this)[d]; } - return r; - } - - - - realvec_t operator+() const { return *this; } - realvec_t operator-() const { return RV(0.0) - *this; } - - realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); } - realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); } - realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); } - realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const - { - // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), - // vml_std::fmax((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); - realvec_t y0022 = x0123.fmax(x1032); - return vml_std::fmax(y0022[0], y0022[2]); - } - real_t minval() const - { - // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), - // vml_std::fmin((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); - realvec_t y0022 = x0123.fmin(x1032); - return vml_std::fmin(y0022[0], y0022[2]); - } - real_t prod() const - { - // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - realvec_t x0123 = *this; - realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); - realvec_t y0022 = x0123 * x1032; - return y0022[0] * y0022[2]; - } - real_t sum() const - { - // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - // __m256d x = _mm256_hadd_pd(v, v); - // __m128d xlo = _mm256_extractf128_pd(x, 0); - // __m128d xhi = _mm256_extractf128_pd(x, 1); - realvec_t x = *this; - x = _mm256_hadd_pd(x.v, x.v); - return x[0] + x[2]; - } - - - - boolvec_t operator==(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ); - } - boolvec_t operator!=(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here - } - boolvec_t operator<(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ); - } - boolvec_t operator<=(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ); - } - boolvec_t operator>(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ); - } - boolvec_t operator>=(realvec_t const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ); - } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const { return _mm256_ceil_pd(v); } - realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return MF::vml_fabs(*this); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const { return _mm256_floor_pd(v); } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); } - realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); } - realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); } - realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } - realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { -#ifdef VML_HAVE_NAN - return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); -#else - return BV(false); -#endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); } - realvec_t remainder(realvec_t y) const - { - return MF::vml_remainder(*this, y); - } - realvec_t rint() const - { - return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT); - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return v; } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return _mm256_sqrt_pd(v); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); } - }; - - - - // boolvec definitions - - inline intvec<double,4> boolvec<double,4>::as_int() const - { - return _mm256_castpd_si256(v); - } - - inline intvec<double,4> boolvec<double,4>::convert_int() const - { - //return ifthen(v, U(1), U(0)); - return lsr(as_int(), bits-1); - } - - inline - boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline - intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const - { - return _mm256_blendv_pd(y.v, x.v, v); - } - - - - // intvec definitions - - inline intvec<double,4> intvec<double,4>::abs() const - { - return MF::vml_abs(*this); - } - - inline - intvec<double,4> intvec<double,4>::bitifthen(intvec_t x, intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<double,4> intvec<double,4>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<double,4> intvec<double,4>::as_float() const - { - return _mm256_castsi256_pd(v); - } - - inline realvec<double,4> intvec<double,4>::convert_float() const - { - realvec_t r; - for (int d=0; d<size; ++d) { - r.set_elt(d, floatprops::convert_float((*this)[d])); + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return _mm256_castpd_si256(v); } + intvec_t convert_int() const { + intvec_t r; + for (int d = 0; d < size; ++d) { + r.set_elt(d, floatprops::convert_int((*this)[d])); } return r; } - - inline intvec<double,4> intvec<double,4>::max(intvec_t x) const - { - return MF::vml_max(*this, x); + + realvec_t operator+() const { return *this; } + realvec_t operator-() const { return RV(0.0) - *this; } + + realvec_t operator+(realvec_t x) const { return _mm256_add_pd(v, x.v); } + realvec_t operator-(realvec_t x) const { return _mm256_sub_pd(v, x.v); } + realvec_t operator*(realvec_t x) const { return _mm256_mul_pd(v, x.v); } + realvec_t operator/(realvec_t x) const { return _mm256_div_pd(v, x.v); } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { + // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), + // vml_std::fmax((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); + realvec_t y0022 = x0123.fmax(x1032); + return vml_std::fmax(y0022[0], y0022[2]); + } + real_t minval() const { + // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), + // vml_std::fmin((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); + realvec_t y0022 = x0123.fmin(x1032); + return vml_std::fmin(y0022[0], y0022[2]); + } + real_t prod() const { + // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + realvec_t x0123 = *this; + realvec_t x1032 = _mm256_shuffle_pd(x0123, x0123, 0b0101); + realvec_t y0022 = x0123 * x1032; + return y0022[0] * y0022[2]; + } + real_t sum() const { + // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + // __m256d x = _mm256_hadd_pd(v, v); + // __m128d xlo = _mm256_extractf128_pd(x, 0); + // __m128d xhi = _mm256_extractf128_pd(x, 1); + realvec_t x = *this; + x = _mm256_hadd_pd(x.v, x.v); + return x[0] + x[2]; + } + + boolvec_t operator==(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ); + } + boolvec_t operator!=(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here + } + boolvec_t operator<(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ); + } + boolvec_t operator<=(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ); + } + boolvec_t operator>(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ); } - - inline intvec<double,4> intvec<double,4>::min(intvec_t x) const - { - return MF::vml_min(*this, x); + boolvec_t operator>=(realvec_t const &x) const { + return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ); } - - inline intvec<double,4> intvec<double,4>::popcount() const - { - return MF::vml_popcount(*this); + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { return _mm256_ceil_pd(v); } + realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return MF::vml_fabs(*this); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { return _mm256_floor_pd(v); } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); } + realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); } + realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } + realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { +#ifdef VML_HAVE_NAN + return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); +#else + return BV(false); +#endif + } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); } - - inline intvec<double,4> intvec<double,4>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); } + realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } + realvec_t rint() const { + return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT); } - - inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return v; } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return _mm256_sqrt_pd(v); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); } +}; + +// boolvec definitions + +inline intvec<double, 4> boolvec<double, 4>::as_int() const { + return _mm256_castpd_si256(v); +} + +inline intvec<double, 4> boolvec<double, 4>::convert_int() const { + // return ifthen(v, U(1), U(0)); + return lsr(as_int(), bits - 1); +} + +inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x, + intvec_t y) const { + return ifthen(x.as_float(), y.as_float()).as_int(); +} + +inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x, + realvec_t y) const { + return _mm256_blendv_pd(y.v, x.v, v); +} + +// intvec definitions + +inline intvec<double, 4> intvec<double, 4>::abs() const { + return MF::vml_abs(*this); +} + +inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<double, 4> intvec<double, 4>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<double, 4> intvec<double, 4>::as_float() const { + return _mm256_castsi256_pd(v); +} + +inline realvec<double, 4> intvec<double, 4>::convert_float() const { + realvec_t r; + for (int d = 0; d < size; ++d) { + r.set_elt(d, floatprops::convert_float((*this)[d])); } - + return r; +} + +inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<double, 4> intvec<double, 4>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_AVX_DOUBLE4_H +#endif // #ifndef VEC_AVX_DOUBLE4_H diff --git a/vec_avx_float8.h b/vec_avx_float8.h index ec1e132..f119aee 100644 --- a/vec_avx_float8.h +++ b/vec_avx_float8.h @@ -12,828 +12,697 @@ // AVX intrinsics #include <immintrin.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_8 - template<> struct boolvec<float,8>; - template<> struct intvec<float,8>; - template<> struct realvec<float,8>; - - - - template<> - struct boolvec<float,8>: floatprops<float> - { - static int const size = 8; - typedef bool scalar_t; - typedef __m256 bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]), - from_bool(as[6]), - from_bool(as[5]), - from_bool(as[4]), - from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec_t& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); } - - boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); } - boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); } - boolvec_t operator==(boolvec_t x) const { return !(*this!=x); } - boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); } - - bool all() const - { - // return - // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] && - // (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7]; - return ! (! *this).any(); - } - bool any() const - { - // return - // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] || - // (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7]; - return ! bool(_mm256_testz_ps(v, v)); - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,8>: floatprops<float> - { - static int const size = 8; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi32(a)) {} - intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4], - as[3], as[2], as[1], as[0])) {} - static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return _mm256_castsi256_ps(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true +template <> struct boolvec<float, 8>; +template <> struct intvec<float, 8>; +template <> struct realvec<float, 8>; + +template <> struct boolvec<float, 8> : floatprops<float> { + static int const size = 8; + typedef bool scalar_t; + typedef __m256 bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {} + boolvec(bool const *as) + : v(_mm256_castsi256_ps(_mm256_set_epi32( + from_bool(as[7]), from_bool(as[6]), from_bool(as[5]), + from_bool(as[4]), from_bool(as[3]), from_bool(as[2]), + from_bool(as[1]), from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec_t &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return _mm256_xor_ps(boolvec(true), v); } + + boolvec_t operator&&(boolvec_t x) const { return _mm256_and_ps(v, x.v); } + boolvec_t operator||(boolvec_t x) const { return _mm256_or_ps(v, x.v); } + boolvec_t operator==(boolvec_t x) const { return !(*this != x); } + boolvec_t operator!=(boolvec_t x) const { return _mm256_xor_ps(v, x.v); } + + bool all() const { + // return + // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] && + // (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7]; + return !(!*this).any(); + } + bool any() const { + // return + // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] || + // (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7]; + return !bool(_mm256_testz_ps(v, v)); + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 8> : floatprops<float> { + static int const size = 8; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm256_set1_epi32(a)) {} + intvec(int_t const *as) + : v(_mm256_set_epi32(as[7], as[6], as[5], as[4], as[3], as[2], as[1], + as[0])) {} + static intvec_t iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return _mm256_castsi256_ps(v); } + boolvec_t convert_bool() const { +// Result: convert_bool(0)=false, convert_bool(else)=true #ifdef __AVX2__ - return *this != IV(I(0)); + return *this != IV(I(0)); #else - // There is no intrinsic to compare to zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec_t x = *this; - // We know that boolvec_t values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + // There is no intrinsic to compare to zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec_t x = *this; + // We know that boolvec_t values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); #endif - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec_t operator+() const { return *this; } - intvec_t operator-() const { return IV(0) - *this; } - - intvec_t operator+(intvec_t x) const - { + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec_t operator+() const { return *this; } + intvec_t operator-() const { return IV(0) - *this; } + + intvec_t operator+(intvec_t x) const { #ifdef __AVX2__ - return _mm256_add_epi32(v, x.v); + return _mm256_add_epi32(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi32(vlo, xvlo); - vhi = _mm_add_epi32(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi32(vlo, xvlo); + vhi = _mm_add_epi32(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t operator-(intvec_t x) const - { + } + intvec_t operator-(intvec_t x) const { #ifdef __AVX2__ - return _mm256_sub_epi32(v, x.v); + return _mm256_sub_epi32(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi32(vlo, xvlo); - vhi = _mm_sub_epi32(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi32(vlo, xvlo); + vhi = _mm_sub_epi32(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - - - - intvec_t operator~() const { return IV(~U(0)) ^ *this; } - - intvec_t operator&(intvec_t x) const - { + } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + + intvec_t operator~() const { return IV(~U(0)) ^ *this; } + + intvec_t operator&(intvec_t x) const { #ifdef __AVX2__ - return _mm256_and_si256(v, x.v); + return _mm256_and_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec_t operator|(intvec_t x) const - { + } + intvec_t operator|(intvec_t x) const { #ifdef __AVX2__ - return _mm256_or_si256(v, x.v); + return _mm256_or_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec_t operator^(intvec_t x) const - { + } + intvec_t operator^(intvec_t x) const { #ifdef __AVX2__ - return _mm256_xor_si256(v, x.v); + return _mm256_xor_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const - { + } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { #ifdef __AVX2__ - return _mm256_srli_epi32(v, n); + return _mm256_srli_epi32(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi32(vlo, n); - vhi = _mm_srli_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi32(vlo, n); + vhi = _mm_srli_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const - { + } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { #ifdef __AVX2__ - return _mm256_srai_epi32(v, n); + return _mm256_srai_epi32(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srai_epi32(vlo, n); - vhi = _mm_srai_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srai_epi32(vlo, n); + vhi = _mm_srai_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t operator<<(int_t n) const - { + } + intvec_t operator<<(int_t n) const { #ifdef __AVX2__ - return _mm256_slli_epi32(v, n); + return _mm256_slli_epi32(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi32(vlo, n); - vhi = _mm_slli_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi32(vlo, n); + vhi = _mm_slli_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { + } + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { #ifdef __AVX2__ - return _mm256_srlv_epi32(v, n.v); + return _mm256_srlv_epi32(v, n.v); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const - { + return r; +#endif + } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { #ifdef __AVX2__ - return _mm256_srav_epi32(v, n.v); + return _mm256_srav_epi32(v, n.v); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - intvec_t operator<<(intvec_t n) const - { + return r; +#endif + } + intvec_t operator<<(intvec_t n) const { #ifdef __AVX2__ - return _mm256_sllv_epi32(v, n.v); + return _mm256_sllv_epi32(v, n.v); #else - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; -#endif + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec_t const& x) const - { + return r; +#endif + } + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec_t const &x) const { #ifdef __AVX2__ - return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v)); + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v)); #else - return ! (*this != x); + return !(*this != x); #endif - } - boolvec_t operator!=(intvec_t const& x) const - { + } + boolvec_t operator!=(intvec_t const &x) const { #ifdef __AVX2__ - return ! (*this == x); + return !(*this == x); #else - return (*this ^ x).convert_bool(); + return (*this ^ x).convert_bool(); #endif - } - boolvec_t operator<(intvec_t const& x) const - { + } + boolvec_t operator<(intvec_t const &x) const { #ifdef __AVX2__ - return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v)); + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v)); #else - // return (*this - x).as_bool(); - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; -#endif + // return (*this - x).as_bool(); + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<float,8>: floatprops<float> - { - static int const size = 8; - typedef real_t scalar_t; - typedef __m256 vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { + return r; +#endif + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<float, 8> : floatprops<float> { + static int const size = 8; + typedef real_t scalar_t; + typedef __m256 vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { #ifdef __AVX2__ - return "<AVX2:8*float>"; + return "<AVX2:8*float>"; #else - return "<AVX:8*float>"; + return "<AVX:8*float>"; #endif + } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm256_set1_ps(a)) {} + realvec(real_t const *as) + : v(_mm256_set_ps(as[7], as[6], as[5], as[4], as[3], as[2], as[1], + as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_ps(p); + } + static realvec_t loadu(real_t const *p) { return _mm256_loadu_ps(p); } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_ps(a)) {} - realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4], - as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_ps(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_ps(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_ps(p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_ps(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - _mm256_maskstore_ps(p, m.m.as_int(), v); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // TODO: this is expensive - for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return _mm256_castps_si256(v); } - intvec_t convert_int() const { return _mm256_cvttps_epi32(v); } - - - - realvec_t operator+() const { return *this; } - realvec_t operator-() const { return RV(0.0) - *this; } - - realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); } - realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); } - realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); } - realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const - { - // return - // vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), - // vml_std::fmax((*this)[2], (*this)[3])), - // vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]), - // vml_std::fmax((*this)[6], (*this)[7]))); - realvec_t x01234567 = *this; - realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); - realvec_t y00224466 = x01234567.fmax(x10325476); - realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); - realvec_t z00004444 = y00224466.fmax(y22006644); - return vml_std::fmax(z00004444[0], z00004444[4]); - } - real_t minval() const - { - // return - // vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), - // vml_std::fmin((*this)[2], (*this)[3])), - // vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]), - // vml_std::fmin((*this)[6], (*this)[7]))); - realvec_t x01234567 = *this; - realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); - realvec_t y00224466 = x01234567.fmin(x10325476); - realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); - realvec_t z00004444 = y00224466.fmin(y22006644); - return vml_std::fmin(z00004444[0], z00004444[4]); - } - real_t prod() const - { - // return - // (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] * - // (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7]; - realvec_t x01234567 = *this; - realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); - realvec_t y00224466 = x01234567 * x10325476; - realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); - realvec_t z00004444 = y00224466 * y22006644; - return z00004444[0] * z00004444[4]; - } - real_t sum() const - { - // return - // (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] + - // (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7]; - // _m256 x = vhaddps(v, v); - // x = vhaddps(x, x); - // __m128 xlo = _mm256_extractf128_ps(x, 0); - // __m128 xhi = _mm256_extractf128_ps(x, 1); - // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi); - realvec_t x = *this; - x = _mm256_hadd_ps(x.v, x.v); - x = _mm256_hadd_ps(x.v, x.v); - return x[0] + x[4]; - } - - - - boolvec_t operator==(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ); - } - boolvec_t operator!=(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here - } - boolvec_t operator<(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ); - } - boolvec_t operator<=(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ); - } - boolvec_t operator>(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - boolvec_t operator>=(realvec_t const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ); + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_ps(p, v); + } + void storeu(real_t *p) const { return _mm256_storeu_ps(p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_ps(p, m.m.as_int(), v); } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const { return _mm256_ceil_ps(v); } - realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return MF::vml_fabs(*this); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const { return _mm256_floor_ps(v); } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n = 0; n < size; ++n) + if (m.m[n]) + p[n] = (*this)[n]; } - realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); } - realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); } - realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } - realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return _mm256_castps_si256(v); } + intvec_t convert_int() const { return _mm256_cvttps_epi32(v); } + + realvec_t operator+() const { return *this; } + realvec_t operator-() const { return RV(0.0) - *this; } + + realvec_t operator+(realvec_t x) const { return _mm256_add_ps(v, x.v); } + realvec_t operator-(realvec_t x) const { return _mm256_sub_ps(v, x.v); } + realvec_t operator*(realvec_t x) const { return _mm256_mul_ps(v, x.v); } + realvec_t operator/(realvec_t x) const { return _mm256_div_ps(v, x.v); } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { + // return + // vml_std::fmax(vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), + // vml_std::fmax((*this)[2], (*this)[3])), + // vml_std::fmax(vml_std::fmax((*this)[4], (*this)[5]), + // vml_std::fmax((*this)[6], (*this)[7]))); + realvec_t x01234567 = *this; + realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); + realvec_t y00224466 = x01234567.fmax(x10325476); + realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); + realvec_t z00004444 = y00224466.fmax(y22006644); + return vml_std::fmax(z00004444[0], z00004444[4]); + } + real_t minval() const { + // return + // vml_std::fmin(vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), + // vml_std::fmin((*this)[2], (*this)[3])), + // vml_std::fmin(vml_std::fmin((*this)[4], (*this)[5]), + // vml_std::fmin((*this)[6], (*this)[7]))); + realvec_t x01234567 = *this; + realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); + realvec_t y00224466 = x01234567.fmin(x10325476); + realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); + realvec_t z00004444 = y00224466.fmin(y22006644); + return vml_std::fmin(z00004444[0], z00004444[4]); + } + real_t prod() const { + // return + // (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3] * + // (*this)[4] * (*this)[5] * (*this)[6] * (*this)[7]; + realvec_t x01234567 = *this; + realvec_t x10325476 = _mm256_shuffle_ps(x01234567, x01234567, 0b10110001); + realvec_t y00224466 = x01234567 * x10325476; + realvec_t y22006644 = _mm256_shuffle_ps(y00224466, y00224466, 0b01001110); + realvec_t z00004444 = y00224466 * y22006644; + return z00004444[0] * z00004444[4]; + } + real_t sum() const { + // return + // (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3] + + // (*this)[4] + (*this)[5] + (*this)[6] + (*this)[7]; + // _m256 x = vhaddps(v, v); + // x = vhaddps(x, x); + // __m128 xlo = _mm256_extractf128_ps(x, 0); + // __m128 xhi = _mm256_extractf128_ps(x, 1); + // return _mm_cvtsd_f64(xlo) + _mm_cvtsd_f64(xhi); + realvec_t x = *this; + x = _mm256_hadd_ps(x.v, x.v); + x = _mm256_hadd_ps(x.v, x.v); + return x[0] + x[4]; + } + + boolvec_t operator==(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_EQ_OQ); + } + boolvec_t operator!=(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here + } + boolvec_t operator<(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_LT_OQ); + } + boolvec_t operator<=(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_LE_OQ); + } + boolvec_t operator>(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ); + } + boolvec_t operator>=(realvec_t const &x) const { + return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ); + } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { return _mm256_ceil_ps(v); } + realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return MF::vml_fabs(*this); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { return _mm256_floor_ps(v); } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); } + realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); } + realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } + realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { #ifdef VML_HAVE_NAN - return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); + return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const - { - realvec_t x = *this; - realvec_t r = _mm256_rcp_ps(x); // this is only an approximation - r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) - return r; - } - realvec_t remainder(realvec_t y) const - { - return MF::vml_remainder(*this, y); - } - realvec_t rint() const - { - return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT); - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const - { - realvec_t x = *this; - realvec_t r = _mm256_rsqrt_ps(x); // this is only an approximation - r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) - return r; - } - boolvec_t signbit() const { return v; } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return _mm256_sqrt_ps(v); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); } - }; - - - - // boolvec definitions - - inline intvec<float,8> boolvec<float,8>::as_int() const - { - return _mm256_castps_si256(v); - } - - inline intvec<float,8> boolvec<float,8>::convert_int() const - { - return lsr(as_int(), bits-1); - } - - inline - boolvec<float,8> boolvec<float,8>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline intvec<float,8> boolvec<float,8>::ifthen(intvec_t x, intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - realvec<float,8> boolvec<float,8>::ifthen(realvec_t x, realvec_t y) const - { - return _mm256_blendv_ps(y.v, x.v, v); - } - - - - // intvec definitions - - inline intvec<float,8> intvec<float,8>::abs() const - { + } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { + realvec_t x = *this; + realvec_t r = _mm256_rcp_ps(x); // this is only an approximation + r *= RV(2.0) - r * x; // one Newton iteration (see vml_rcp) + return r; + } + realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } + realvec_t rint() const { + return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT); + } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { + realvec_t x = *this; + realvec_t r = _mm256_rsqrt_ps(x); // this is only an approximation + r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt) + return r; + } + boolvec_t signbit() const { return v; } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return _mm256_sqrt_ps(v); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); } +}; + +// boolvec definitions + +inline intvec<float, 8> boolvec<float, 8>::as_int() const { + return _mm256_castps_si256(v); +} + +inline intvec<float, 8> boolvec<float, 8>::convert_int() const { + return lsr(as_int(), bits - 1); +} + +inline boolvec<float, 8> boolvec<float, 8>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<float, 8> boolvec<float, 8>::ifthen(intvec_t x, + intvec_t y) const { + return ifthen(x.as_float(), y.as_float()).as_int(); +} + +inline realvec<float, 8> boolvec<float, 8>::ifthen(realvec_t x, + realvec_t y) const { + return _mm256_blendv_ps(y.v, x.v, v); +} + +// intvec definitions + +inline intvec<float, 8> intvec<float, 8>::abs() const { #ifdef __AVX2__ - return _mm256_abs_epi32(v); + return _mm256_abs_epi32(v); #else - return MF::vml_abs(*this); + return MF::vml_abs(*this); #endif - } - - inline realvec<float,8> intvec<float,8>::as_float() const - { - return _mm256_castsi256_ps(v); - } - - inline intvec<float,8> intvec<float,8>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<float,8> intvec<float,8>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<float,8> intvec<float,8>::convert_float() const - { - return _mm256_cvtepi32_ps(v); - } - - inline intvec<float,8> intvec<float,8>::max(intvec_t x) const - { - return MF::vml_max(*this, x); - } - - inline intvec<float,8> intvec<float,8>::min(intvec_t x) const - { - return MF::vml_min(*this, x); - } - - inline intvec<float,8> intvec<float,8>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<float,8> intvec<float,8>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,8> intvec<float,8>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +} + +inline realvec<float, 8> intvec<float, 8>::as_float() const { + return _mm256_castsi256_ps(v); +} + +inline intvec<float, 8> intvec<float, 8>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<float, 8> intvec<float, 8>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<float, 8> intvec<float, 8>::convert_float() const { + return _mm256_cvtepi32_ps(v); +} + +inline intvec<float, 8> intvec<float, 8>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<float, 8> intvec<float, 8>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<float, 8> intvec<float, 8>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<float, 8> intvec<float, 8>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 8> intvec<float, 8>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_AVX_FLOAT8_H +#endif // #ifndef VEC_AVX_FLOAT8_H diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h index ddade85..6af27e5 100644 --- a/vec_avx_fp16_16.h +++ b/vec_avx_fp16_16.h @@ -12,378 +12,309 @@ // AVX intrinsics #include <immintrin.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FP16_16 - template<> struct boolvec<fp16,16>; - template<> struct intvec<fp16,16>; - template<> struct realvec<fp16,16>; - - - - template<> - struct boolvec<fp16,16>: floatprops<fp16> - { - static int const size = 16; - typedef bool scalar_t; - typedef __m256i bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {} - boolvec(bool const* as): - v(_mm256_set_epi16(from_bool(as[15]), - from_bool(as[14]), - from_bool(as[13]), - from_bool(as[12]), - from_bool(as[11]), - from_bool(as[10]), - from_bool(as[ 9]), - from_bool(as[ 8]), - from_bool(as[ 7]), - from_bool(as[ 6]), - from_bool(as[ 5]), - from_bool(as[ 4]), - from_bool(as[ 3]), - from_bool(as[ 2]), - from_bool(as[ 1]), - from_bool(as[ 0]))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return *this != boolvec(true); } - - boolvec operator&&(boolvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator||(boolvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - bool all() const - { - bool r = (*this)[0]; - for (int n=1; n<size; ++n) r = r && (*this)[n]; - return r; - } - bool any() const - { - bool r = (*this)[0];; - for (int n=1; n<size; ++n) r = r || (*this)[n]; - return r; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<fp16,16>: floatprops<fp16> - { - static int const size = 16; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi16(a)) {} - intvec(int_t const* as): - v(_mm256_set_epi16(as[15], - as[14], - as[13], - as[12], - as[11], - as[10], - as[ 9], - as[ 8], - as[ 7], - as[ 6], - as[ 5], - as[ 4], - as[ 3], - as[ 2], - as[ 1], - as[ 0])) {} - static intvec iota() - { - return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return v; } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare to zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { +template <> struct boolvec<fp16, 16>; +template <> struct intvec<fp16, 16>; +template <> struct realvec<fp16, 16>; + +template <> struct boolvec<fp16, 16> : floatprops<fp16> { + static int const size = 16; + typedef bool scalar_t; + typedef __m256i bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm256_set1_epi16(from_bool(a))) {} + boolvec(bool const *as) + : v(_mm256_set_epi16(from_bool(as[15]), from_bool(as[14]), + from_bool(as[13]), from_bool(as[12]), + from_bool(as[11]), from_bool(as[10]), + from_bool(as[9]), from_bool(as[8]), from_bool(as[7]), + from_bool(as[6]), from_bool(as[5]), from_bool(as[4]), + from_bool(as[3]), from_bool(as[2]), from_bool(as[1]), + from_bool(as[0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const { + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const { + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this != x); } + boolvec operator!=(boolvec x) const { + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + + bool all() const { + bool r = (*this)[0]; + for (int n = 1; n < size; ++n) + r = r && (*this)[n]; + return r; + } + bool any() const { + bool r = (*this)[0]; + ; + for (int n = 1; n < size; ++n) + r = r || (*this)[n]; + return r; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<fp16, 16> : floatprops<fp16> { + static int const size = 16; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm256_set1_epi16(a)) {} + intvec(int_t const *as) + : v(_mm256_set_epi16(as[15], as[14], as[13], as[12], as[11], as[10], + as[9], as[8], as[7], as[6], as[5], as[4], as[3], + as[2], as[1], as[0])) {} + static intvec iota() { + return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare to zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const { #ifdef __AVX2__ - return _mm256_add_epi16(v, x.v); + return _mm256_add_epi16(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi16(vlo, xvlo); - vhi = _mm_add_epi16(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi16(vlo, xvlo); + vhi = _mm_add_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator-(intvec x) const - { + } + intvec operator-(intvec x) const { #ifdef __AVX2__ - return _mm256_sub_epi16(v, x.v); + return _mm256_sub_epi16(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi16(vlo, xvlo); - vhi = _mm_sub_epi16(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi16(vlo, xvlo); + vhi = _mm_sub_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { + } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const { #ifdef __AVX2__ - return _mm256_and_si256(v, x.v); + return _mm256_and_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec operator|(intvec x) const - { + } + intvec operator|(intvec x) const { #ifdef __AVX2__ - return _mm256_or_si256(v, x.v); + return _mm256_or_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec operator^(intvec x) const - { + } + intvec operator^(intvec x) const { #ifdef __AVX2__ - return _mm256_xor_si256(v, x.v); + return _mm256_xor_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { + } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec lsr(int_t n) const { #ifdef __AVX2__ - return _mm256_srli_epi16(v, n); + return _mm256_srli_epi16(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi16(vlo, n); - vhi = _mm_srli_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi16(vlo, n); + vhi = _mm_srli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator>>(int_t n) const - { + } + intvec operator>>(int_t n) const { #ifdef __AVX2__ - return _mm256_srai_epi16(v, n); + return _mm256_srai_epi16(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srai_epi16(vlo, n); - vhi = _mm_srai_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srai_epi16(vlo, n); + vhi = _mm_srai_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator<<(int_t n) const - { + } + intvec operator<<(int_t n) const { #ifdef __AVX2__ - return _mm256_slli_epi16(v, n); + return _mm256_slli_epi16(v, n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi16(vlo, n); - vhi = _mm_slli_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi16(vlo, n); + vhi = _mm_slli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec lsr(intvec n) const - { + } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec lsr(intvec n) const { #ifdef __AVX2__ - // TODO: Use permute instead of shift/mask? - _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff)); - _mm256i vlo = _mm256_and_si256(mlo, v); - _mm256i vhi = v; - _mm256i clo = _mm256_and_si256(mlo, n); - _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16)); - _mm256i rlo = _mm256_srlv_epi32(vlo, clo); - _mm256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi)); - return _mm256_or_si256(rhi, rlo); + // TODO: Use permute instead of shift/mask? + _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff)); + _mm256i vlo = _mm256_and_si256(mlo, v); + _mm256i vhi = v; + _mm256i clo = _mm256_and_si256(mlo, n); + _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16)); + _mm256i rlo = _mm256_srlv_epi32(vlo, clo); + _mm256i rhi = _mm256_andnot_si256(mlo, _mm256_srlv_epi32(vhi, chi)); + return _mm256_or_si256(rhi, rlo); #else - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; -#endif + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - intvec operator>>(intvec n) const - { + return r; +#endif + } + intvec operator>>(intvec n) const { #ifdef __AVX2__ - intvec_t offset = U(1) << (bits-1); - return (*this + offset).lsr(n) - offset.lsr(n); + intvec_t offset = U(1) << (bits - 1); + return (*this + offset).lsr(n) - offset.lsr(n); #else - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; -#endif + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - intvec operator<<(intvec n) const - { + return r; +#endif + } + intvec operator<<(intvec n) const { #ifdef __AVX2__ - // TODO: Use permute instead of shift/mask? - _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff)); - _mm256i vlo = v; + // TODO: Use permute instead of shift/mask? + _mm256i mlo = _mm256_set1_epi32(U(0x0000ffff)); + _mm256i vlo = v; _mm256i vhi = _mm256_andnot_si256(mlo, v; _mm256i clo = _mm256_and_si256(mlo, n); _mm256i chi = _mm256_and_si256(mlo, _mm256_srli_epi32(n, 16)); @@ -391,338 +322,274 @@ namespace vecmathlib { _mm256i rhi = _mm256_sllv_epi32(vhi, chi); return _mm256_or_si256(rhi, rlo); #else - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; -#endif + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - - - boolvec_t operator==(intvec const& x) const - { + return r; +#endif + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + boolvec_t operator==(intvec const &x) const { #ifdef __AVX2__ - return _mm256_cmpeq_epi16(v, x.v); + return _mm256_cmpeq_epi16(v, x.v); #else - return ! (*this != x); + return !(*this != x); #endif - } - boolvec_t operator!=(intvec const& x) const - { + } + boolvec_t operator!=(intvec const &x) const { #ifdef __AVX2__ - return ! (*this == x); + return !(*this == x); #else - return (*this ^ x).convert_bool(); + return (*this ^ x).convert_bool(); #endif - } - boolvec_t operator<(intvec const& x) const - { + } + boolvec_t operator<(intvec const &x) const { #ifdef __AVX2__ - return _mm256_cmpgt_epi16(x.v, v); + return _mm256_cmpgt_epi16(x.v, v); #else - // TODO: First compare sign; then if equal, compare sign of difference - // TODO: Also look for intrinsics - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; -#endif - } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; + // TODO: First compare sign; then if equal, compare sign of difference + // TODO: Also look for intrinsics + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<fp16,16>: floatprops<fp16> - { - static int const size = 16; - typedef real_t scalar_t; - typedef __m256i vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { + return r; +#endif + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<fp16, 16> : floatprops<fp16> { + static int const size = 16; + typedef real_t scalar_t; + typedef __m256i vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { #ifdef __AVX2__ - return "<AVX2:16*fp16>"; + return "<AVX2:16*fp16>"; #else - return "<AVX:16*fp16>"; + return "<AVX:16*fp16>"; #endif + } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm256_set1_epi16(FP::as_int(a))) {} + realvec(real_t const *as) + : v(_mm256_set_epi16( + FP::as_int(as[15]), FP::as_int(as[14]), FP::as_int(as[13]), + FP::as_int(as[12]), FP::as_int(as[11]), FP::as_int(as[10]), + FP::as_int(as[9]), FP::as_int(as[8]), FP::as_int(as[7]), + FP::as_int(as[6]), FP::as_int(as[5]), FP::as_int(as[4]), + FP::as_int(as[3]), FP::as_int(as[2]), FP::as_int(as[1]), + FP::as_int(as[0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_si256((__m256i const *)p); + } + static realvec_t loadu(real_t const *p) { + return _mm256_loadu_si256((__m256i const *)p); + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {} - realvec(real_t const* as): - v(_mm256_set_epi16(FP::as_int(as[15]), - FP::as_int(as[14]), - FP::as_int(as[13]), - FP::as_int(as[12]), - FP::as_int(as[11]), - FP::as_int(as[10]), - FP::as_int(as[ 9]), - FP::as_int(as[ 8]), - FP::as_int(as[ 7]), - FP::as_int(as[ 6]), - FP::as_int(as[ 5]), - FP::as_int(as[ 4]), - FP::as_int(as[ 3]), - FP::as_int(as[ 2]), - FP::as_int(as[ 1]), - FP::as_int(as[ 0]))) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_si256((__m256i*)p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_si256((__m256i*)p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // TODO: this is expensive - for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // TODO: this is expensive - for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - - - - intvec_t as_int() const { return v; } - intvec_t convert_int() const { __builtin_unreachable(); } - - - - realvec operator+() const { __builtin_unreachable(); } - realvec operator-() const { __builtin_unreachable(); } - - realvec operator+(realvec x) const { __builtin_unreachable(); } - realvec operator-(realvec x) const { __builtin_unreachable(); } - realvec operator*(realvec x) const { __builtin_unreachable(); } - realvec operator/(realvec x) const { __builtin_unreachable(); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const { __builtin_unreachable(); } - real_t minval() const { __builtin_unreachable(); } - real_t prod() const { __builtin_unreachable(); } - real_t sum() const { __builtin_unreachable(); } - - - - boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } - - - - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec fabs() const { return MF::vml_fabs(*this); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - boolvec_t signbit() const { return v; } - }; - - - - // boolvec definitions - - inline intvec<fp16,16> boolvec<fp16,16>::as_int() const - { - return v; - } - - inline intvec<fp16,16> boolvec<fp16,16>::convert_int() const - { - return lsr(as_int(), bits-1); - } - - inline - boolvec<fp16,16> boolvec<fp16,16>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline intvec<fp16,16> boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const - { - return (( -convert_int() & x) | (~-convert_int() & y)); - } - - inline - realvec<fp16,16> boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_float(); - } - - - - // intvec definitions - - inline intvec<fp16,16> intvec<fp16,16>::abs() const - { -#ifdef __AVX2__ - return _mm256_abs_epi16(v); -#else - return MF::vml_abs(*this); -#endif } - - inline realvec<fp16,16> intvec<fp16,16>::as_float() const - { - return v; + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_si256((__m256i *)p, v); } - - inline realvec<fp16,16> intvec<fp16,16>::convert_float() const - { - __builtin_unreachable(); + void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n = 0; n < size; ++n) + if (m.m[n]) + p[n] = (*this)[n]; + } } - - inline intvec<fp16,16> intvec<fp16,16>::max(intvec_t x) const - { - return MF::vml_max(*this, x); + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n = 0; n < size; ++n) + if (m.m[n]) + p[n] = (*this)[n]; + } } - - inline intvec<fp16,16> intvec<fp16,16>::min(intvec_t x) const - { - return MF::vml_min(*this, x); + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); } - + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { __builtin_unreachable(); } + + realvec operator+() const { __builtin_unreachable(); } + realvec operator-() const { __builtin_unreachable(); } + + realvec operator+(realvec x) const { __builtin_unreachable(); } + realvec operator-(realvec x) const { __builtin_unreachable(); } + realvec operator*(realvec x) const { __builtin_unreachable(); } + realvec operator/(realvec x) const { __builtin_unreachable(); } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { __builtin_unreachable(); } + real_t minval() const { __builtin_unreachable(); } + real_t prod() const { __builtin_unreachable(); } + real_t sum() const { __builtin_unreachable(); } + + boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); } + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } +}; + +// boolvec definitions + +inline intvec<fp16, 16> boolvec<fp16, 16>::as_int() const { return v; } + +inline intvec<fp16, 16> boolvec<fp16, 16>::convert_int() const { + return lsr(as_int(), bits - 1); +} + +inline boolvec<fp16, 16> boolvec<fp16, 16>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<fp16, 16> boolvec<fp16, 16>::ifthen(intvec_t x, + intvec_t y) const { + return ((-convert_int() & x) | (~ - convert_int() & y)); +} + +inline realvec<fp16, 16> boolvec<fp16, 16>::ifthen(realvec_t x, + realvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_float(); +} + +// intvec definitions + +inline intvec<fp16, 16> intvec<fp16, 16>::abs() const { +#ifdef __AVX2__ + return _mm256_abs_epi16(v); +#else + return MF::vml_abs(*this); +#endif +} + +inline realvec<fp16, 16> intvec<fp16, 16>::as_float() const { return v; } + +inline realvec<fp16, 16> intvec<fp16, 16>::convert_float() const { + __builtin_unreachable(); +} + +inline intvec<fp16, 16> intvec<fp16, 16>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<fp16, 16> intvec<fp16, 16>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + } // namespace vecmathlib -#endif // #ifndef VEC_AVX_FP16_16_H +#endif // #ifndef VEC_AVX_FP16_16_H diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h index 912bd19..0ae79e7 100644 --- a/vec_avx_fp8_32.h +++ b/vec_avx_fp8_32.h @@ -12,763 +12,592 @@ // AVX intrinsics #include <immintrin.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FP8_32 - template<> struct boolvec<fp8,32>; - template<> struct intvec<fp8,32>; - template<> struct realvec<fp8,32>; - - - - template<> - struct boolvec<fp8,32>: floatprops<fp8> - { - static int const size = 32; - typedef bool scalar_t; - typedef __m256i bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {} - boolvec(bool const* as): - v(_mm256_set_epi8(from_bool(as[31]), - from_bool(as[30]), - from_bool(as[29]), - from_bool(as[28]), - from_bool(as[27]), - from_bool(as[26]), - from_bool(as[25]), - from_bool(as[24]), - from_bool(as[23]), - from_bool(as[22]), - from_bool(as[21]), - from_bool(as[20]), - from_bool(as[19]), - from_bool(as[18]), - from_bool(as[17]), - from_bool(as[16]), - from_bool(as[15]), - from_bool(as[14]), - from_bool(as[13]), - from_bool(as[12]), - from_bool(as[11]), - from_bool(as[10]), - from_bool(as[ 9]), - from_bool(as[ 8]), - from_bool(as[ 7]), - from_bool(as[ 6]), - from_bool(as[ 5]), - from_bool(as[ 4]), - from_bool(as[ 3]), - from_bool(as[ 2]), - from_bool(as[ 1]), - from_bool(as[ 0]))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return *this != boolvec(true); } - - boolvec operator&&(boolvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator||(boolvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - bool all() const - { - bool r = (*this)[0]; - for (int n=1; n<size; ++n) r = r && (*this)[n]; - return r; - } - bool any() const - { - bool r = (*this)[0];; - for (int n=1; n<size; ++n) r = r || (*this)[n]; - return r; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<fp8,32>: floatprops<fp8> - { - static int const size = 32; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi8(a)) {} - intvec(int_t const* as): - v(_mm256_set_epi8(as[31], - as[30], - as[29], - as[28], - as[27], - as[26], - as[25], - as[24], - as[23], - as[22], - as[21], - as[20], - as[19], - as[18], - as[17], - as[16], - as[15], - as[14], - as[13], - as[12], - as[11], - as[10], - as[ 9], - as[ 8], - as[ 7], - as[ 6], - as[ 5], - as[ 4], - as[ 3], - as[ 2], - as[ 1], - as[ 0])) {} - static intvec iota() - { - return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return v; } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare to zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { +template <> struct boolvec<fp8, 32>; +template <> struct intvec<fp8, 32>; +template <> struct realvec<fp8, 32>; + +template <> struct boolvec<fp8, 32> : floatprops<fp8> { + static int const size = 32; + typedef bool scalar_t; + typedef __m256i bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm256_set1_epi8(from_bool(a))) {} + boolvec(bool const *as) + : v(_mm256_set_epi8( + from_bool(as[31]), from_bool(as[30]), from_bool(as[29]), + from_bool(as[28]), from_bool(as[27]), from_bool(as[26]), + from_bool(as[25]), from_bool(as[24]), from_bool(as[23]), + from_bool(as[22]), from_bool(as[21]), from_bool(as[20]), + from_bool(as[19]), from_bool(as[18]), from_bool(as[17]), + from_bool(as[16]), from_bool(as[15]), from_bool(as[14]), + from_bool(as[13]), from_bool(as[12]), from_bool(as[11]), + from_bool(as[10]), from_bool(as[9]), from_bool(as[8]), + from_bool(as[7]), from_bool(as[6]), from_bool(as[5]), + from_bool(as[4]), from_bool(as[3]), from_bool(as[2]), + from_bool(as[1]), from_bool(as[0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const { + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const { + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this != x); } + boolvec operator!=(boolvec x) const { + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); + } + + bool all() const { + bool r = (*this)[0]; + for (int n = 1; n < size; ++n) + r = r && (*this)[n]; + return r; + } + bool any() const { + bool r = (*this)[0]; + ; + for (int n = 1; n < size; ++n) + r = r || (*this)[n]; + return r; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<fp8, 32> : floatprops<fp8> { + static int const size = 32; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm256_set1_epi8(a)) {} + intvec(int_t const *as) + : v(_mm256_set_epi8(as[31], as[30], as[29], as[28], as[27], as[26], + as[25], as[24], as[23], as[22], as[21], as[20], + as[19], as[18], as[17], as[16], as[15], as[14], + as[13], as[12], as[11], as[10], as[9], as[8], as[7], + as[6], as[5], as[4], as[3], as[2], as[1], as[0])) {} + static intvec iota() { + return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, + 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, + 3, 2, 1, 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare to zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const { #ifdef __AVX2__ - return _mm256_add_epi8(v, x.v); + return _mm256_add_epi8(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi8(vlo, xvlo); - vhi = _mm_add_epi8(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi8(vlo, xvlo); + vhi = _mm_add_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator-(intvec x) const - { + } + intvec operator-(intvec x) const { #ifdef __AVX2__ - return _mm256_sub_epi8(v, x.v); + return _mm256_sub_epi8(v, x.v); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi8(vlo, xvlo); - vhi = _mm_sub_epi8(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi8(vlo, xvlo); + vhi = _mm_sub_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { + } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const { #ifdef __AVX2__ - return _mm256_and_si256(v, x.v); + return _mm256_and_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec operator|(intvec x) const - { + } + intvec operator|(intvec x) const { #ifdef __AVX2__ - return _mm256_or_si256(v, x.v); + return _mm256_or_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - intvec operator^(intvec x) const - { + } + intvec operator^(intvec x) const { #ifdef __AVX2__ - return _mm256_xor_si256(v, x.v); + return _mm256_xor_si256(v, x.v); #else - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); + return _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); #endif - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { + } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec lsr(int_t n) const { #ifdef __AVX2__ - uint_t masklo = U(0x00ffU) >> U(n); - uint_t maskhi = U(0xff00U); - intvec mask = masklo | maskhi; - return intvec(_mm256_srai_epi16(v, n)) & mask; + uint_t masklo = U(0x00ffU) >> U(n); + uint_t maskhi = U(0xff00U); + intvec mask = masklo | maskhi; + return intvec(_mm256_srai_epi16(v, n)) & mask; #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU) >> U(n); - uint_t maskhi = U(0xff00U); - __m128i mask = _mm_set1_epi16(masklo | maskhi); - vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); - vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU) >> U(n); + uint_t maskhi = U(0xff00U); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator>>(int_t n) const - { + } + intvec operator>>(int_t n) const { #ifdef __AVX2__ - // There is no _mm256_srai_epi8. To emulate it, add 0x80 before - // shifting, and subtract the shifted 0x80 after shifting - intvec_t offset = U(1) << (bits-1); - return (*this + offset).lsr(n) - offset.lsr(n); + // There is no _mm256_srai_epi8. To emulate it, add 0x80 before + // shifting, and subtract the shifted 0x80 after shifting + intvec_t offset = U(1) << (bits - 1); + return (*this + offset).lsr(n) - offset.lsr(n); #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU); - uint_t maskhi = U(0xff00U); - __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8), - _mm_set1_epi16(masklo)); - __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n), - _mm_set1_epi16(maskhi)); - vlo = _mm_or_si128(vlolo, vlohi); - __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8), - _mm_set1_epi16(masklo)); - __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n), - _mm_set1_epi16(maskhi)); - vhi = _mm_or_si128(vhilo, vhihi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U); + __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n + 8), + _mm_set1_epi16(masklo)); + __m128i vlohi = + _mm_and_si128(_mm_srai_epi16(vlo, n), _mm_set1_epi16(maskhi)); + vlo = _mm_or_si128(vlolo, vlohi); + __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n + 8), + _mm_set1_epi16(masklo)); + __m128i vhihi = + _mm_and_si128(_mm_srai_epi16(vhi, n), _mm_set1_epi16(maskhi)); + vhi = _mm_or_si128(vhilo, vhihi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif - } - intvec operator<<(int_t n) const - { + } + intvec operator<<(int_t n) const { #ifdef __AVX2__ - uint_t masklo = U(0x00ffU); - uint_t maskhi = U(0xff00U) << U(n); - intvec mask = masklo | maskhi; - return intvec(_mm256_slli_epi16(v, n)) & mask; + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U) << U(n); + intvec mask = masklo | maskhi; + return intvec(_mm256_slli_epi16(v, n)) & mask; #else - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU); - uint_t maskhi = U(0xff00U) << U(n); - __m128i mask = _mm_set1_epi16(masklo | maskhi); - vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); - vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U) << U(n); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); #endif + } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec lsr(intvec n) const { + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec lsr(intvec n) const - { - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; + return r; + } + intvec operator>>(intvec n) const { + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; + return r; + } + intvec operator<<(intvec n) const { + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - - - boolvec_t operator==(intvec const& x) const - { + return r; + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + boolvec_t operator==(intvec const &x) const { #ifdef __AVX2__ - return _mm256_cmpeq_epi8(v, x.v); + return _mm256_cmpeq_epi8(v, x.v); #else - return ! (*this != x); + return !(*this != x); #endif - } - boolvec_t operator!=(intvec const& x) const - { + } + boolvec_t operator!=(intvec const &x) const { #ifdef __AVX2__ - return ! (*this == x); + return !(*this == x); #else - return (*this ^ x).convert_bool(); + return (*this ^ x).convert_bool(); #endif - } - boolvec_t operator<(intvec const& x) const - { + } + boolvec_t operator<(intvec const &x) const { #ifdef __AVX2__ - return _mm256_cmpgt_epi8(x.v, v); + return _mm256_cmpgt_epi8(x.v, v); #else - // TODO: First compare sign; then if equal, compare sign of difference - // TODO: Also look for intrinsics - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; -#endif - } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; + // TODO: First compare sign; then if equal, compare sign of difference + // TODO: Also look for intrinsics + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<fp8,32>: floatprops<fp8> - { - static int const size = 32; - typedef real_t scalar_t; - typedef __m256i vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { + return r; +#endif + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<fp8, 32> : floatprops<fp8> { + static int const size = 32; + typedef real_t scalar_t; + typedef __m256i vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { #ifdef __AVX2__ - return "<AVX2:32*fp8>"; + return "<AVX2:32*fp8>"; #else - return "<AVX:32*fp8>"; + return "<AVX:32*fp8>"; #endif + } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm256_set1_epi8(FP::as_int(a))) {} + realvec(real_t const *as) + : v(_mm256_set_epi8( + FP::as_int(as[31]), FP::as_int(as[30]), FP::as_int(as[29]), + FP::as_int(as[28]), FP::as_int(as[27]), FP::as_int(as[26]), + FP::as_int(as[25]), FP::as_int(as[24]), FP::as_int(as[23]), + FP::as_int(as[22]), FP::as_int(as[21]), FP::as_int(as[20]), + FP::as_int(as[19]), FP::as_int(as[18]), FP::as_int(as[17]), + FP::as_int(as[16]), FP::as_int(as[15]), FP::as_int(as[14]), + FP::as_int(as[13]), FP::as_int(as[12]), FP::as_int(as[11]), + FP::as_int(as[10]), FP::as_int(as[9]), FP::as_int(as[8]), + FP::as_int(as[7]), FP::as_int(as[6]), FP::as_int(as[5]), + FP::as_int(as[4]), FP::as_int(as[3]), FP::as_int(as[2]), + FP::as_int(as[1]), FP::as_int(as[0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_si256((__m256i const *)p); + } + static realvec_t loadu(real_t const *p) { + return _mm256_loadu_si256((__m256i const *)p); + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {} - realvec(real_t const* as): - v(_mm256_set_epi8(FP::as_int(as[31]), - FP::as_int(as[30]), - FP::as_int(as[29]), - FP::as_int(as[28]), - FP::as_int(as[27]), - FP::as_int(as[26]), - FP::as_int(as[25]), - FP::as_int(as[24]), - FP::as_int(as[23]), - FP::as_int(as[22]), - FP::as_int(as[21]), - FP::as_int(as[20]), - FP::as_int(as[19]), - FP::as_int(as[18]), - FP::as_int(as[17]), - FP::as_int(as[16]), - FP::as_int(as[15]), - FP::as_int(as[14]), - FP::as_int(as[13]), - FP::as_int(as[12]), - FP::as_int(as[11]), - FP::as_int(as[10]), - FP::as_int(as[ 9]), - FP::as_int(as[ 8]), - FP::as_int(as[ 7]), - FP::as_int(as[ 6]), - FP::as_int(as[ 5]), - FP::as_int(as[ 4]), - FP::as_int(as[ 3]), - FP::as_int(as[ 2]), - FP::as_int(as[ 1]), - FP::as_int(as[ 0]))) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_si256((__m256i*)p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_si256((__m256i*)p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // TODO: this is expensive - for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // TODO: this is expensive - for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - - - - intvec_t as_int() const { return v; } - intvec_t convert_int() const { __builtin_unreachable(); } - - - - realvec operator+() const { __builtin_unreachable(); } - realvec operator-() const { __builtin_unreachable(); } - - realvec operator+(realvec x) const { __builtin_unreachable(); } - realvec operator-(realvec x) const { __builtin_unreachable(); } - realvec operator*(realvec x) const { __builtin_unreachable(); } - realvec operator/(realvec x) const { __builtin_unreachable(); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const { __builtin_unreachable(); } - real_t minval() const { __builtin_unreachable(); } - real_t prod() const { __builtin_unreachable(); } - real_t sum() const { __builtin_unreachable(); } - - - - boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } - - - - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec fabs() const { return MF::vml_fabs(*this); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - boolvec_t signbit() const { return v; } - }; - - - - // boolvec definitions - - inline intvec<fp8,32> boolvec<fp8,32>::as_int() const - { - return v; - } - - inline intvec<fp8,32> boolvec<fp8,32>::convert_int() const - { - return lsr(as_int(), bits-1); - } - - inline - boolvec<fp8,32> boolvec<fp8,32>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const - { - return (( -convert_int() & x) | (~-convert_int() & y)); - } - - inline - realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_float(); - } - - - - // intvec definitions - - inline intvec<fp8,32> intvec<fp8,32>::abs() const - { -#ifdef __AVX2__ - return _mm256_abs_epi8(v); -#else - return MF::vml_abs(*this); -#endif } - - inline realvec<fp8,32> intvec<fp8,32>::as_float() const - { - return v; + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_si256((__m256i *)p, v); } - - inline realvec<fp8,32> intvec<fp8,32>::convert_float() const - { - __builtin_unreachable(); + void storeu(real_t *p) const { return _mm256_storeu_si256((__m256i *)p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); } - - inline intvec<fp8,32> intvec<fp8,32>::max(intvec_t x) const - { - return MF::vml_max(*this, x); + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n = 0; n < size; ++n) + if (m.m[n]) + p[n] = (*this)[n]; + } } - - inline intvec<fp8,32> intvec<fp8,32>::min(intvec_t x) const - { - return MF::vml_min(*this, x); + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n = 0; n < size; ++n) + if (m.m[n]) + p[n] = (*this)[n]; + } } - + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { __builtin_unreachable(); } + + realvec operator+() const { __builtin_unreachable(); } + realvec operator-() const { __builtin_unreachable(); } + + realvec operator+(realvec x) const { __builtin_unreachable(); } + realvec operator-(realvec x) const { __builtin_unreachable(); } + realvec operator*(realvec x) const { __builtin_unreachable(); } + realvec operator/(realvec x) const { __builtin_unreachable(); } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { __builtin_unreachable(); } + real_t minval() const { __builtin_unreachable(); } + real_t prod() const { __builtin_unreachable(); } + real_t sum() const { __builtin_unreachable(); } + + boolvec_t operator==(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator!=(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator<(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator<=(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator>(realvec const &x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const &x) const { __builtin_unreachable(); } + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } +}; + +// boolvec definitions + +inline intvec<fp8, 32> boolvec<fp8, 32>::as_int() const { return v; } + +inline intvec<fp8, 32> boolvec<fp8, 32>::convert_int() const { + return lsr(as_int(), bits - 1); +} + +inline boolvec<fp8, 32> boolvec<fp8, 32>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<fp8, 32> boolvec<fp8, 32>::ifthen(intvec_t x, intvec_t y) const { + return ((-convert_int() & x) | (~ - convert_int() & y)); +} + +inline realvec<fp8, 32> boolvec<fp8, 32>::ifthen(realvec_t x, + realvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_float(); +} + +// intvec definitions + +inline intvec<fp8, 32> intvec<fp8, 32>::abs() const { +#ifdef __AVX2__ + return _mm256_abs_epi8(v); +#else + return MF::vml_abs(*this); +#endif +} + +inline realvec<fp8, 32> intvec<fp8, 32>::as_float() const { return v; } + +inline realvec<fp8, 32> intvec<fp8, 32>::convert_float() const { + __builtin_unreachable(); +} + +inline intvec<fp8, 32> intvec<fp8, 32>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<fp8, 32> intvec<fp8, 32>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + } // namespace vecmathlib -#endif // #ifndef VEC_AVX_FP8_32_H +#endif // #ifndef VEC_AVX_FP8_32_H @@ -4,663 +4,544 @@ #define VEC_BASE_H #ifndef VML_NO_IOSTREAM -# include <iostream> +#include <iostream> #endif #include "vec_mask.h" +namespace vecmathlib { +template <typename real_t, int size> struct boolvec {}; -namespace vecmathlib { - - template<typename real_t, int size> - struct boolvec { - }; - - template<typename real_t, int size> - struct intvec { - }; - - template<typename real_t, int size> - struct realvec { - }; - - - - // boolvec wrappers - - template<typename real_t, int size> - inline intvec<real_t, size> as_int(boolvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> convert_int(boolvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline bool all(boolvec<real_t, size> x) { return x.all(); } - - template<typename real_t, int size> - inline bool any(boolvec<real_t, size> x) { return x.any(); } - - template<typename real_t, int size> - inline - boolvec<real_t, size> ifthen(boolvec<real_t, size> c, - boolvec<real_t, size> x, - boolvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - intvec<real_t, size> ifthen(boolvec<real_t, size> c, - intvec<real_t, size> x, - intvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - realvec<real_t, size> ifthen(boolvec<real_t, size> c, - realvec<real_t, size> x, - realvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - - - // intvec wrappers - - template<typename real_t, int size> - inline boolvec<real_t, size> as_bool(intvec<real_t, size> x) - { - return x.as_bool(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x) - { - return x.convert_bool(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> as_float(intvec<real_t, size> x) - { - return x.as_float(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> convert_float(intvec<real_t, size> x) - { - return x.convert_float(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> abs(intvec<real_t, size> x) - { - return x.abs(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> bitifthen(intvec<real_t, size> x, - intvec<real_t, size> y, - intvec<real_t, size> z) - { - return x.bitifthen(y, z); - } - - template<typename real_t, int size> - inline intvec<real_t, size> clz(intvec<real_t, size> x) - { - return x.clz(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> isignbit(intvec<real_t, size> x) - { - return x.isignbit(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> lsr(intvec<real_t, size> x, - typename intvec<real_t, size>::int_t n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline intvec<real_t, size> lsr(intvec<real_t, size> x, - intvec<real_t, size> n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline intvec<real_t, size> max(intvec<real_t, size> x, - intvec<real_t, size> y) - { - return x.max(y); - } - - template<typename real_t, int size> - inline intvec<real_t, size> min(intvec<real_t, size> x, - intvec<real_t, size> y) - { - return x.min(y); - } - - template<typename real_t, int size> - inline intvec<real_t, size> popcount(intvec<real_t, size> x) - { - return x.popcount(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> rotate(intvec<real_t, size> x, - typename intvec<real_t, size>::int_t n) - { - return x.rotate(n); - } - - template<typename real_t, int size> - inline intvec<real_t, size> rotate(intvec<real_t, size> x, - intvec<real_t, size> n) - { - return x.rotate(n); - } - - - - // realvec wrappers - - template<typename real_t, int size> - inline realvec<real_t, size> - loada(real_t const* p, - realvec<real_t, size> x, - typename realvec<real_t, size>::mask_t const& m) - { - return x.loada(p, m); - } - - template<typename real_t, int size> - inline realvec<real_t, size> - loadu(real_t const* p, - realvec<real_t, size> x, - typename realvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, m); - } - - template<typename real_t, int size> - inline realvec<real_t, size> - loadu(real_t const* p, size_t ioff, - realvec<real_t, size> x, - typename realvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, ioff, m); - } - - template<typename real_t, int size> - inline void storea(realvec<real_t, size> x, real_t* p) - { - x.storea(p); - } - - template<typename real_t, int size> - inline void storeu(realvec<real_t, size> x, real_t* p) - { - x.storeu(p); - } - - template<typename real_t, int size> - inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff) - { - x.storeu(p, ioff); - } - - template<typename real_t, int size> - inline void storea(realvec<real_t, size> x, real_t* p, - typename realvec<real_t, size>::mask_t const& m) - { - x.storea(p, m); - } - - template<typename real_t, int size> - inline void storeu(realvec<real_t, size> x, real_t* p, - typename realvec<real_t, size>::mask_t const& m) - { - x.storeu(p, m); - } - - template<typename real_t, int size> - inline void storeu(realvec<real_t, size> x, real_t* p, size_t ioff, - typename realvec<real_t, size>::mask_t const &m) - { - x.storeu(p, ioff, m); - } - - - - template<typename real_t, int size> - inline intvec<real_t, size> as_int(realvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> convert_int(realvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline - typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x) - { - return x.maxval(); - } - - template<typename real_t, int size> - inline - typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x) - { - return x.minval(); - } - - template<typename real_t, int size> - inline - typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x) - { - return x.prod(); - } - - template<typename real_t, int size> - inline - typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x) - { - return x.sum(); - } - - - - template<typename real_t, int size> - inline realvec<real_t, size> acos(realvec<real_t, size> x) - { - return x.acos(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> acosh(realvec<real_t, size> x) - { - return x.acosh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> asin(realvec<real_t, size> x) - { - return x.asin(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> asinh(realvec<real_t, size> x) - { - return x.asinh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> atan(realvec<real_t, size> x) - { - return x.atan(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> atan2(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.atan2(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> atanh(realvec<real_t, size> x) - { - return x.atanh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> cbrt(realvec<real_t, size> x) - { - return x.cbrt(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> ceil(realvec<real_t, size> x) - { - return x.ceil(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> copysign(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.copysign(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> cos(realvec<real_t, size> x) - { - return x.cos(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> cosh(realvec<real_t, size> x) - { - return x.cosh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> exp(realvec<real_t, size> x) - { - return x.exp(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> exp10(realvec<real_t, size> x) - { - return x.exp10(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> exp2(realvec<real_t, size> x) - { - return x.exp2(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> expm1(realvec<real_t, size> x) - { - return x.expm1(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fabs(realvec<real_t, size> x) - { - return x.fabs(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> floor(realvec<real_t, size> x) - { - return x.floor(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fdim(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.fdim(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fma(realvec<real_t, size> x, - realvec<real_t, size> y, - realvec<real_t, size> z) - { - return x.fma(y, z); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fmax(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.fmax(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fmin(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.fmin(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> fmod(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.fmod(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> frexp(realvec<real_t, size> x, - intvec<real_t, size>* r) - { - return x.frexp(r); - } - - template<typename real_t, int size> - inline realvec<real_t, size> hypot(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.hypot(y); - } - - template<typename real_t, int size> - inline intvec<real_t, size> ilogb(realvec<real_t, size> x) - { - return x.ilogb(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> isfinite(realvec<real_t, size> x) - { - return x.isfinite(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> isinf(realvec<real_t, size> x) - { - return x.isinf(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> isnan(realvec<real_t, size> x) - { - return x.isnan(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> isnormal(realvec<real_t, size> x) - { - return x.isnormal(); - } - - template<typename real_t, int size> - inline - realvec<real_t, size> ldexp(realvec<real_t, size> x, - typename intvec<real_t, size>::int_t n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline - realvec<real_t, size> ldexp(realvec<real_t, size> x, - intvec<real_t, size> n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline realvec<real_t, size> log(realvec<real_t, size> x) - { - return x.log(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> log10(realvec<real_t, size> x) - { - return x.log10(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> log1p(realvec<real_t, size> x) - { - return x.log1p(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> log2(realvec<real_t, size> x) - { - return x.log2(); - } - - template<typename real_t, int size> - inline intvec<real_t, size> lrint(realvec<real_t, size> x) - { - return x.lrint(); - } +template <typename real_t, int size> struct intvec {}; + +template <typename real_t, int size> struct realvec {}; + +// boolvec wrappers + +template <typename real_t, int size> +inline intvec<real_t, size> as_int(boolvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> convert_int(boolvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> inline bool all(boolvec<real_t, size> x) { + return x.all(); +} + +template <typename real_t, int size> inline bool any(boolvec<real_t, size> x) { + return x.any(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> ifthen(boolvec<real_t, size> c, + boolvec<real_t, size> x, + boolvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline intvec<real_t, size> ifthen(boolvec<real_t, size> c, + intvec<real_t, size> x, + intvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> ifthen(boolvec<real_t, size> c, + realvec<real_t, size> x, + realvec<real_t, size> y) { + return c.ifthen(x, y); +} + +// intvec wrappers + +template <typename real_t, int size> +inline boolvec<real_t, size> as_bool(intvec<real_t, size> x) { + return x.as_bool(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> convert_bool(intvec<real_t, size> x) { + return x.convert_bool(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> as_float(intvec<real_t, size> x) { + return x.as_float(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> convert_float(intvec<real_t, size> x) { + return x.convert_float(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> abs(intvec<real_t, size> x) { + return x.abs(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> bitifthen(intvec<real_t, size> x, + intvec<real_t, size> y, + intvec<real_t, size> z) { + return x.bitifthen(y, z); +} + +template <typename real_t, int size> +inline intvec<real_t, size> clz(intvec<real_t, size> x) { + return x.clz(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> isignbit(intvec<real_t, size> x) { + return x.isignbit(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> lsr(intvec<real_t, size> x, + typename intvec<real_t, size>::int_t n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intvec<real_t, size> lsr(intvec<real_t, size> x, + intvec<real_t, size> n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intvec<real_t, size> max(intvec<real_t, size> x, + intvec<real_t, size> y) { + return x.max(y); +} + +template <typename real_t, int size> +inline intvec<real_t, size> min(intvec<real_t, size> x, + intvec<real_t, size> y) { + return x.min(y); +} + +template <typename real_t, int size> +inline intvec<real_t, size> popcount(intvec<real_t, size> x) { + return x.popcount(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> rotate(intvec<real_t, size> x, + typename intvec<real_t, size>::int_t n) { + return x.rotate(n); +} + +template <typename real_t, int size> +inline intvec<real_t, size> rotate(intvec<real_t, size> x, + intvec<real_t, size> n) { + return x.rotate(n); +} + +// realvec wrappers + +template <typename real_t, int size> +inline realvec<real_t, size> +loada(real_t const *p, realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const &m) { + return x.loada(p, m); +} + +template <typename real_t, int size> +inline realvec<real_t, size> +loadu(real_t const *p, realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const &m) { + return x.loadu(p, m); +} + +template <typename real_t, int size> +inline realvec<real_t, size> +loadu(real_t const *p, size_t ioff, realvec<real_t, size> x, + typename realvec<real_t, size>::mask_t const &m) { + return x.loadu(p, ioff, m); +} + +template <typename real_t, int size> +inline void storea(realvec<real_t, size> x, real_t *p) { + x.storea(p); +} + +template <typename real_t, int size> +inline void storeu(realvec<real_t, size> x, real_t *p) { + x.storeu(p); +} + +template <typename real_t, int size> +inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff) { + x.storeu(p, ioff); +} + +template <typename real_t, int size> +inline void storea(realvec<real_t, size> x, real_t *p, + typename realvec<real_t, size>::mask_t const &m) { + x.storea(p, m); +} + +template <typename real_t, int size> +inline void storeu(realvec<real_t, size> x, real_t *p, + typename realvec<real_t, size>::mask_t const &m) { + x.storeu(p, m); +} + +template <typename real_t, int size> +inline void storeu(realvec<real_t, size> x, real_t *p, size_t ioff, + typename realvec<real_t, size>::mask_t const &m) { + x.storeu(p, ioff, m); +} + +template <typename real_t, int size> +inline intvec<real_t, size> as_int(realvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> convert_int(realvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline typename realvec<real_t, size>::real_t maxval(realvec<real_t, size> x) { + return x.maxval(); +} + +template <typename real_t, int size> +inline typename realvec<real_t, size>::real_t minval(realvec<real_t, size> x) { + return x.minval(); +} + +template <typename real_t, int size> +inline typename realvec<real_t, size>::real_t prod(realvec<real_t, size> x) { + return x.prod(); +} + +template <typename real_t, int size> +inline typename realvec<real_t, size>::real_t sum(realvec<real_t, size> x) { + return x.sum(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> acos(realvec<real_t, size> x) { + return x.acos(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> acosh(realvec<real_t, size> x) { + return x.acosh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> asin(realvec<real_t, size> x) { + return x.asin(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> asinh(realvec<real_t, size> x) { + return x.asinh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> atan(realvec<real_t, size> x) { + return x.atan(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> atan2(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.atan2(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> atanh(realvec<real_t, size> x) { + return x.atanh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> cbrt(realvec<real_t, size> x) { + return x.cbrt(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> ceil(realvec<real_t, size> x) { + return x.ceil(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> copysign(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.copysign(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> cos(realvec<real_t, size> x) { + return x.cos(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> cosh(realvec<real_t, size> x) { + return x.cosh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> exp(realvec<real_t, size> x) { + return x.exp(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> exp10(realvec<real_t, size> x) { + return x.exp10(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> exp2(realvec<real_t, size> x) { + return x.exp2(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> expm1(realvec<real_t, size> x) { + return x.expm1(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> fabs(realvec<real_t, size> x) { + return x.fabs(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> floor(realvec<real_t, size> x) { + return x.floor(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> fdim(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.fdim(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> +fma(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) { + return x.fma(y, z); +} + +template <typename real_t, int size> +inline realvec<real_t, size> fmax(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.fmax(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> fmin(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.fmin(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> fmod(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.fmod(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> frexp(realvec<real_t, size> x, + intvec<real_t, size> *r) { + return x.frexp(r); +} + +template <typename real_t, int size> +inline realvec<real_t, size> hypot(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.hypot(y); +} + +template <typename real_t, int size> +inline intvec<real_t, size> ilogb(realvec<real_t, size> x) { + return x.ilogb(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> isfinite(realvec<real_t, size> x) { + return x.isfinite(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> isinf(realvec<real_t, size> x) { + return x.isinf(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> isnan(realvec<real_t, size> x) { + return x.isnan(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> isnormal(realvec<real_t, size> x) { + return x.isnormal(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> ldexp(realvec<real_t, size> x, + typename intvec<real_t, size>::int_t n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realvec<real_t, size> ldexp(realvec<real_t, size> x, + intvec<real_t, size> n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realvec<real_t, size> log(realvec<real_t, size> x) { + return x.log(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> log10(realvec<real_t, size> x) { + return x.log10(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> log1p(realvec<real_t, size> x) { + return x.log1p(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> log2(realvec<real_t, size> x) { + return x.log2(); +} + +template <typename real_t, int size> +inline intvec<real_t, size> lrint(realvec<real_t, size> x) { + return x.lrint(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> +mad(realvec<real_t, size> x, realvec<real_t, size> y, realvec<real_t, size> z) { + return x.mad(y, z); +} + +template <typename real_t, int size> +inline realvec<real_t, size> nextafter(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.nextafter(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> pow(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.pow(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> rcp(realvec<real_t, size> x) { + return x.rcp(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> remainder(realvec<real_t, size> x, + realvec<real_t, size> y) { + return x.remainder(y); +} + +template <typename real_t, int size> +inline realvec<real_t, size> rint(realvec<real_t, size> x) { + return x.rint(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> round(realvec<real_t, size> x) { + return x.round(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> rsqrt(realvec<real_t, size> x) { + return x.rsqrt(); +} + +template <typename real_t, int size> +inline boolvec<real_t, size> signbit(realvec<real_t, size> x) { + return x.signbit(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> sin(realvec<real_t, size> x) { + return x.sin(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> sinh(realvec<real_t, size> x) { + return x.sinh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> sqrt(realvec<real_t, size> x) { + return x.sqrt(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> tan(realvec<real_t, size> x) { + return x.tan(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> tanh(realvec<real_t, size> x) { + return x.tanh(); +} + +template <typename real_t, int size> +inline realvec<real_t, size> trunc(realvec<real_t, size> x) { + return x.trunc(); +} - template<typename real_t, int size> - inline realvec<real_t, size> mad(realvec<real_t, size> x, - realvec<real_t, size> y, - realvec<real_t, size> z) - { - return x.mad(y, z); - } - - template<typename real_t, int size> - inline realvec<real_t, size> nextafter(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.nextafter(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> pow(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.pow(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> rcp(realvec<real_t, size> x) - { - return x.rcp(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> remainder(realvec<real_t, size> x, - realvec<real_t, size> y) - { - return x.remainder(y); - } - - template<typename real_t, int size> - inline realvec<real_t, size> rint(realvec<real_t, size> x) - { - return x.rint(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> round(realvec<real_t, size> x) - { - return x.round(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> rsqrt(realvec<real_t, size> x) - { - return x.rsqrt(); - } - - template<typename real_t, int size> - inline boolvec<real_t, size> signbit(realvec<real_t, size> x) - { - return x.signbit(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> sin(realvec<real_t, size> x) - { - return x.sin(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> sinh(realvec<real_t, size> x) - { - return x.sinh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> sqrt(realvec<real_t, size> x) - { - return x.sqrt(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> tan(realvec<real_t, size> x) - { - return x.tan(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> tanh(realvec<real_t, size> x) - { - return x.tanh(); - } - - template<typename real_t, int size> - inline realvec<real_t, size> trunc(realvec<real_t, size> x) - { - return x.trunc(); - } - - - #ifndef VML_NO_IOSTREAM - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, boolvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, intvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, realvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, boolvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, intvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, realvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} #endif - + } // namespace vecmathlib -#endif // #ifndef VEC_BASE_H +#endif // #ifndef VEC_BASE_H diff --git a/vec_builtin.h b/vec_builtin.h index bbe4277..2f1ff90 100644 --- a/vec_builtin.h +++ b/vec_builtin.h @@ -12,1450 +12,1253 @@ #include <cmath> #include <cstring> #ifndef VML_NO_IOSTREAM -# include <sstream> +#include <sstream> #endif #include <string> +namespace vecmathlib { +template <typename T, int N> struct boolbuiltinvec; +template <typename T, int N> struct intbuiltinvec; +template <typename T, int N> struct realbuiltinvec; -namespace vecmathlib { - - template<typename T, int N> struct boolbuiltinvec; - template<typename T, int N> struct intbuiltinvec; - template<typename T, int N> struct realbuiltinvec; - - - - template<typename T, int N> - struct boolbuiltinvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static const int size = N; - typedef bool scalar_t; - typedef int_t bvector_t __attribute__((__ext_vector_type__(N))); - static const int alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true is -1, false is 0 - static int_t from_bool(bool a) { return -uint_t(a); } - static bool to_bool(int_t a) { return a; } - public: - - typedef boolbuiltinvec boolvec_t; - typedef intbuiltinvec<real_t, size> intvec_t; - typedef realbuiltinvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolbuiltinvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {} - // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; } - // Can't have a constructor from bvector_t, since this would - // conflict with the constructor from bool - // boolbuiltinvec(bvector_t x): v(x) {} - static boolvec_t mkvec(bvector_t x) { boolvec_t res; res.v=x; return res; } - boolbuiltinvec(bool a): v(from_bool(a)) {} - boolbuiltinvec(const bool* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(v[n]); } - boolvec_t& set_elt(int n, bool a) { return v[n]=from_bool(a), *this; } - - - - intvec_t as_int() const; // defined after intbuiltinvec - intvec_t convert_int() const; // defined after intbuiltinvec - - - - boolvec_t operator!() const { return mkvec(!v); } - - boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); } - boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); } - boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); } - boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); } - - bool all() const - { - bool res = (*this)[0]; - for (int d=1; d<size; ++d) res = res && (*this)[d]; - return res; - } - bool any() const - { - bool res = (*this)[0]; - for (int d=1; d<size; ++d) res = res || (*this)[d]; - return res; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realbuiltinvec - }; - - - - template<typename T, int N> - struct intbuiltinvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static const int size = N; - typedef int_t scalar_t; - typedef int_t ivector_t __attribute__((__ext_vector_type__(N))); - typedef uint_t uvector_t __attribute__((__ext_vector_type__(N))); - static const int alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - static_assert(size * sizeof(real_t) == sizeof(uvector_t), - "vector size is wrong"); - - typedef boolbuiltinvec<real_t, size> boolvec_t; - typedef intbuiltinvec intvec_t; - typedef realbuiltinvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intbuiltinvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intbuiltinvec(const intbuiltinvec& x): v(x.v) {} - // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; } - // Can't have a constructor from ivector_t, since this would - // conflict with the constructor from int_t - // intbuiltinvec(ivector_t x): v(x) {} - static intvec_t mkvec(ivector_t x) { intvec_t res; res.v=x; return res; } - intbuiltinvec(int_t a): v(a) {} - intbuiltinvec(const int_t* as) { std::memcpy(&v, as, sizeof v); } - static intvec_t iota() - { - intvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, d); - return res; - } - - int_t operator[](int n) const { return v[n]; } - intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; } - - - - boolvec_t as_bool() const - { - boolvec_t res; - std::memcpy(&res.v, &v, sizeof res.v); - return res; - } - boolvec_t convert_bool() const { return *this != IV(I(0)); } - realvec_t as_float() const; // defined after realbuiltinvec - realvec_t convert_float() const; // defined after realbuiltinvec - - - - intvec_t operator+() const { return mkvec(+v); } - intvec_t operator-() const { return mkvec(-v); } - - intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); } - intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); } - intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); } - intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); } - intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); } - - intvec_t& operator+=(const intvec_t& x) { return *this=*this+x; } - intvec_t& operator-=(const intvec_t& x) { return *this=*this-x; } - intvec_t& operator*=(const intvec_t& x) { return *this=*this*x; } - intvec_t& operator/=(const intvec_t& x) { return *this=*this/x; } - intvec_t& operator%=(const intvec_t& x) { return *this=*this%x; } - - - - intvec_t operator~() const { return mkvec(~v); } - - intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); } - intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); } - intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); } - - intvec_t& operator&=(const intvec_t& x) { return *this=*this&x; } - intvec_t& operator|=(const intvec_t& x) { return *this=*this|x; } - intvec_t& operator^=(const intvec_t& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - - - intvec_t lsr(int_t n) const - { - return mkvec(ivector_t(uvector_t(v) >> U(n))); - } - intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); } - intvec_t operator>>(int_t n) const { return mkvec(v >> n); } - intvec_t operator<<(int_t n) const { return mkvec(v << n); } - - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - return mkvec(ivector_t(uvector_t(v)>>uvector_t(n.v))); - } - intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); } - intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); } - intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); } - - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const - { - intvec_t res; - for (int d=0; d<size; ++d) { - int_t val = (*this)[d]; - int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val)); - res.set_elt(d, cnt); - } - return res; - } - intvec_t popcount() const - { - intvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_popcount(U((*this)[d]))); - } - return res; - } - - - - boolvec_t operator==(const intvec_t& x) const - { - return boolvec_t::mkvec(v == x.v); - } - boolvec_t operator!=(const intvec_t& x) const - { - return boolvec_t::mkvec(v != x.v); - } - boolvec_t operator<(const intvec_t& x) const - { - return boolvec_t::mkvec(v < x.v); - } - boolvec_t operator<=(const intvec_t& x) const - { - return boolvec_t::mkvec(v <= x.v); - } - boolvec_t operator>(const intvec_t& x) const - { - return boolvec_t::mkvec(v > x.v); - } - boolvec_t operator>=(const intvec_t& x) const - { - return boolvec_t::mkvec(v >= x.v); +template <typename T, int N> struct boolbuiltinvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static const int size = N; + typedef bool scalar_t; + typedef int_t bvector_t __attribute__((__ext_vector_type__(N))); + static const int alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true is -1, false is 0 + static int_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(int_t a) { return a; } + +public: + typedef boolbuiltinvec boolvec_t; + typedef intbuiltinvec<real_t, size> intvec_t; + typedef realbuiltinvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolbuiltinvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolbuiltinvec(const boolbuiltinvec& x): v(x.v) {} + // boolbuiltinvec& operator=(const boolbuiltinvec& x) { return v=x.v, *this; } + // Can't have a constructor from bvector_t, since this would + // conflict with the constructor from bool + // boolbuiltinvec(bvector_t x): v(x) {} + static boolvec_t mkvec(bvector_t x) { + boolvec_t res; + res.v = x; + return res; + } + boolbuiltinvec(bool a) : v(from_bool(a)) {} + boolbuiltinvec(const bool *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(v[n]); } + boolvec_t &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; } + + intvec_t as_int() const; // defined after intbuiltinvec + intvec_t convert_int() const; // defined after intbuiltinvec + + boolvec_t operator!() const { return mkvec(!v); } + + boolvec_t operator&&(boolvec_t x) const { return mkvec(v && x.v); } + boolvec_t operator||(boolvec_t x) const { return mkvec(v || x.v); } + boolvec_t operator==(boolvec_t x) const { return mkvec(v == x.v); } + boolvec_t operator!=(boolvec_t x) const { return mkvec(v != x.v); } + + bool all() const { + bool res = (*this)[0]; + for (int d = 1; d < size; ++d) + res = res && (*this)[d]; + return res; + } + bool any() const { + bool res = (*this)[0]; + for (int d = 1; d < size; ++d) + res = res || (*this)[d]; + return res; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intbuiltinvec + realvec_t ifthen(realvec_t x, + realvec_t y) const; // defined after realbuiltinvec +}; + +template <typename T, int N> struct intbuiltinvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static const int size = N; + typedef int_t scalar_t; + typedef int_t ivector_t __attribute__((__ext_vector_type__(N))); + typedef uint_t uvector_t __attribute__((__ext_vector_type__(N))); + static const int alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + static_assert(size * sizeof(real_t) == sizeof(uvector_t), + "vector size is wrong"); + + typedef boolbuiltinvec<real_t, size> boolvec_t; + typedef intbuiltinvec intvec_t; + typedef realbuiltinvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intbuiltinvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intbuiltinvec(const intbuiltinvec& x): v(x.v) {} + // intbuiltinvec& operator=(const intbuiltinvec& x) { return v=x.v, *this; } + // Can't have a constructor from ivector_t, since this would + // conflict with the constructor from int_t + // intbuiltinvec(ivector_t x): v(x) {} + static intvec_t mkvec(ivector_t x) { + intvec_t res; + res.v = x; + return res; + } + intbuiltinvec(int_t a) : v(a) {} + intbuiltinvec(const int_t *as) { std::memcpy(&v, as, sizeof v); } + static intvec_t iota() { + intvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, d); + return res; + } + + int_t operator[](int n) const { return v[n]; } + intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; } + + boolvec_t as_bool() const { + boolvec_t res; + std::memcpy(&res.v, &v, sizeof res.v); + return res; + } + boolvec_t convert_bool() const { return *this != IV(I(0)); } + realvec_t as_float() const; // defined after realbuiltinvec + realvec_t convert_float() const; // defined after realbuiltinvec + + intvec_t operator+() const { return mkvec(+v); } + intvec_t operator-() const { return mkvec(-v); } + + intvec_t operator+(intvec_t x) const { return mkvec(v + x.v); } + intvec_t operator-(intvec_t x) const { return mkvec(v - x.v); } + intvec_t operator*(intvec_t x) const { return mkvec(v * x.v); } + intvec_t operator/(intvec_t x) const { return mkvec(v / x.v); } + intvec_t operator%(intvec_t x) const { return mkvec(v % x.v); } + + intvec_t &operator+=(const intvec_t &x) { return *this = *this + x; } + intvec_t &operator-=(const intvec_t &x) { return *this = *this - x; } + intvec_t &operator*=(const intvec_t &x) { return *this = *this * x; } + intvec_t &operator/=(const intvec_t &x) { return *this = *this / x; } + intvec_t &operator%=(const intvec_t &x) { return *this = *this % x; } + + intvec_t operator~() const { return mkvec(~v); } + + intvec_t operator&(intvec_t x) const { return mkvec(v & x.v); } + intvec_t operator|(intvec_t x) const { return mkvec(v | x.v); } + intvec_t operator^(intvec_t x) const { return mkvec(v ^ x.v); } + + intvec_t &operator&=(const intvec_t &x) { return *this = *this & x; } + intvec_t &operator|=(const intvec_t &x) { return *this = *this | x; } + intvec_t &operator^=(const intvec_t &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); + } + + intvec_t lsr(int_t n) const { return mkvec(ivector_t(uvector_t(v) >> U(n))); } + intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); } + intvec_t operator>>(int_t n) const { return mkvec(v >> n); } + intvec_t operator<<(int_t n) const { return mkvec(v << n); } + + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + return mkvec(ivector_t(uvector_t(v) >> uvector_t(n.v))); + } + intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); } + intvec_t operator>>(intvec_t n) const { return mkvec(v >> n.v); } + intvec_t operator<<(intvec_t n) const { return mkvec(v << n.v); } + + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const { + intvec_t res; + for (int d = 0; d < size; ++d) { + int_t val = (*this)[d]; + int_t cnt = val == 0 ? CHAR_BIT * sizeof val : builtin_clz(U(val)); + res.set_elt(d, cnt); } - - intvec_t abs() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, builtin_abs((*this)[d])); - return res; + return res; + } + intvec_t popcount() const { + intvec_t res; + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_popcount(U((*this)[d]))); } - - boolvec_t isignbit() const { return MF::vml_isignbit(*this); } - - intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } - intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } - }; - - - - template<typename T, int N> - struct realbuiltinvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static const int size = N; - typedef real_t scalar_t; - typedef real_t vector_t __attribute__((__ext_vector_type__(N))); - static const int alignment = sizeof(vector_t); - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - + return res; + } + + boolvec_t operator==(const intvec_t &x) const { + return boolvec_t::mkvec(v == x.v); + } + boolvec_t operator!=(const intvec_t &x) const { + return boolvec_t::mkvec(v != x.v); + } + boolvec_t operator<(const intvec_t &x) const { + return boolvec_t::mkvec(v < x.v); + } + boolvec_t operator<=(const intvec_t &x) const { + return boolvec_t::mkvec(v <= x.v); + } + boolvec_t operator>(const intvec_t &x) const { + return boolvec_t::mkvec(v > x.v); + } + boolvec_t operator>=(const intvec_t &x) const { + return boolvec_t::mkvec(v >= x.v); + } + + intvec_t abs() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, builtin_abs((*this)[d])); + return res; + } + + boolvec_t isignbit() const { return MF::vml_isignbit(*this); } + + intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } + intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } +}; + +template <typename T, int N> struct realbuiltinvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static const int size = N; + typedef real_t scalar_t; + typedef real_t vector_t __attribute__((__ext_vector_type__(N))); + static const int alignment = sizeof(vector_t); + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + #ifndef VML_NO_IOSTREAM - static const char* name() - { - static std::string name_; - if (name_.empty()) { - std::stringstream buf; - buf << "<builtin:" << N << "*" << FP::name() << ">"; - name_ = buf.str(); - } - return name_.c_str(); + static const char *name() { + static std::string name_; + if (name_.empty()) { + std::stringstream buf; + buf << "<builtin:" << N << "*" << FP::name() << ">"; + name_ = buf.str(); } + return name_.c_str(); + } #endif - void barrier() { volatile vector_t x __attribute__((__unused__)) = v; } - - typedef boolbuiltinvec<real_t, size> boolvec_t; - typedef intbuiltinvec<real_t, size> intvec_t; - typedef realbuiltinvec realvec_t; - - private: - boolvec_t mapb(bool f(real_t)) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - intvec_t map(int_t f(real_t)) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - realvec_t map(real_t f(real_t)) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - realvec_t map(real_t f(real_t, int_t), intvec_t x) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]); - return res; - } - realvec_t map(real_t f(real_t, int_t*), intvec_t* x) const - { - realvec_t res; - for (int d=0; d<size; ++d) { - int_t ix; - res.v[d] = f(v[d], &ix); - x->set_elt(d, ix); - } - return res; - } - realvec_t map(real_t f(real_t, real_t), realvec_t x) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]); - return res; - } - realvec_t map(real_t f(real_t, real_t, real_t), - realvec_t x, realvec_t y) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]); - return res; + void barrier() { volatile vector_t x __attribute__((__unused__)) = v; } + + typedef boolbuiltinvec<real_t, size> boolvec_t; + typedef intbuiltinvec<real_t, size> intvec_t; + typedef realbuiltinvec realvec_t; + +private: + boolvec_t mapb(bool f(real_t)) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + intvec_t map(int_t f(real_t)) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + realvec_t map(real_t f(real_t)) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + realvec_t map(real_t f(real_t, int_t), intvec_t x) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d]); + return res; + } + realvec_t map(real_t f(real_t, int_t *), intvec_t *x) const { + realvec_t res; + for (int d = 0; d < size; ++d) { + int_t ix; + res.v[d] = f(v[d], &ix); + x->set_elt(d, ix); } - public: - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realbuiltinvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realbuiltinvec(const realbuiltinvec& x): v(x.v) {} - // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; } - // Can't have a constructor from vector_t, since this would - // conflict with the constructor from real_t - // realbuiltinvec(vector_t x): v(x) {} - static realvec_t mkvec(vector_t x) { realvec_t res; res.v=x; return res; } - realbuiltinvec(real_t a): v(a) {} - realbuiltinvec(const real_t* as) { std::memcpy(&v, as, sizeof v); } - - real_t operator[](int n) const { return v[n]; } - realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(const real_t* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); + return res; + } + realvec_t map(real_t f(real_t, real_t), realvec_t x) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d]); + return res; + } + realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x, + realvec_t y) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d], y.v[d]); + return res; + } + +public: + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realbuiltinvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realbuiltinvec(const realbuiltinvec& x): v(x.v) {} + // realbuiltinvec& operator=(const realbuiltinvec& x) { return v=x.v, *this; } + // Can't have a constructor from vector_t, since this would + // conflict with the constructor from real_t + // realbuiltinvec(vector_t x): v(x) {} + static realvec_t mkvec(vector_t x) { + realvec_t res; + res.v = x; + return res; + } + realbuiltinvec(real_t a) : v(a) {} + realbuiltinvec(const real_t *as) { std::memcpy(&v, as, sizeof v); } + + real_t operator[](int n) const { return v[n]; } + realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(const real_t *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); #if __has_builtin(__builtin_assume_aligned) - p = (const real_t*)__builtin_assume_aligned(p, sizeof(realvec_t)); + p = (const real_t *)__builtin_assume_aligned(p, sizeof(realvec_t)); #endif - return mkvec(*(const vector_t*)p); - } - static realvec_t loadu(const real_t* p) - { - // return mkvec(*(const vector_t*)p); - realvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, p[d]); - return res; - // realvec_t res; - // memcpy(&res.v, p, sizeof res.v); - // return res; - } - static realvec_t loadu(const real_t* p, size_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loadu(p+ioff); - } - realvec_t loada(const real_t* p, const mask_t& m) const - { - return m.m.ifthen(loada(p), *this); - } - realvec_t loadu(const real_t* p, const mask_t& m) const - { - return m.m.ifthen(loadu(p), *this); - } - realvec_t loadu(const real_t* p, size_t ioff, const mask_t& m) const - { - return m.m.ifthen(loadu(p, ioff), *this); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); + return mkvec(*(const vector_t *)p); + } + static realvec_t loadu(const real_t *p) { + // return mkvec(*(const vector_t*)p); + realvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, p[d]); + return res; + // realvec_t res; + // memcpy(&res.v, p, sizeof res.v); + // return res; + } + static realvec_t loadu(const real_t *p, size_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loadu(p + ioff); + } + realvec_t loada(const real_t *p, const mask_t &m) const { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(const real_t *p, const mask_t &m) const { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(const real_t *p, size_t ioff, const mask_t &m) const { + return m.m.ifthen(loadu(p, ioff), *this); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); #if __has_builtin(__builtin_assume_aligned) - p = (real_t*)__builtin_assume_aligned(p, sizeof(realvec_t)); + p = (real_t *)__builtin_assume_aligned(p, sizeof(realvec_t)); #endif - *(vector_t*)p = v; - } - void storeu(real_t* p) const - { - // *(vector_t*)p = v; - for (int d=0; d<size; ++d) p[d] = (*this)[d]; - // memcpy(p, &v, sizeof res.v); - } - void storeu(real_t* p, size_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff); - } - void storea(real_t* p, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p, m); - } - void storeu(real_t* p, const mask_t& m) const - { - for (int d=0; d<size; ++d) if (m.m[d]) p[d] = (*this)[d]; - } - void storeu(real_t* p, size_t ioff, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const - { - intvec_t res; - std::memcpy(&res.v, &v, sizeof res.v); - return res; - } - intvec_t convert_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, int_t((*this)[d])); - return res; - } - - - - realvec_t operator+() const { return mkvec(+v); } - realvec_t operator-() const { return mkvec(-v); } - - realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); } - realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); } - realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); } - realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); } - - realvec_t& operator+=(const realvec_t& x) { return *this=*this+x; } - realvec_t& operator-=(const realvec_t& x) { return *this=*this-x; } - realvec_t& operator*=(const realvec_t& x) { return *this=*this*x; } - realvec_t& operator/=(const realvec_t& x) { return *this=*this/x; } - - real_t maxval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) { - res = builtin_fmax(res, (*this)[d]); - } - return res; - } - real_t minval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) { - res = builtin_fmin(res, (*this)[d]); - } - return res; - } - real_t prod() const - { - real_t res = (*this)[0]; - for (int d=1; d<size; ++d) res *= (*this)[d]; - return res; - } - real_t sum() const - { - real_t res = (*this)[0]; - for (int d=1; d<size; ++d) res += (*this)[d]; - return res; - } - - - - boolvec_t operator==(const realvec_t& x) const - { - return boolvec_t::mkvec(v == x.v); - } - boolvec_t operator!=(const realvec_t& x) const - { - return boolvec_t::mkvec(v != x.v); - } - boolvec_t operator<(const realvec_t& x) const - { - return boolvec_t::mkvec(v < x.v); - } - boolvec_t operator<=(const realvec_t& x) const - { - return boolvec_t::mkvec(v <= x.v); - } - boolvec_t operator>(const realvec_t& x) const - { - return boolvec_t::mkvec(v > x.v); - } - boolvec_t operator>=(const realvec_t& x) const - { - return boolvec_t::mkvec(v >= x.v); - } - - - - realvec_t acos() const { return map(builtin_acos); } - realvec_t acosh() const { return map(builtin_acosh); } - realvec_t asin() const { return map(builtin_asin); } - realvec_t asinh() const { return map(builtin_asinh); } - realvec_t atan() const { return map(builtin_atan); } - realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); } - realvec_t atanh() const { return map(builtin_atanh); } - realvec_t cbrt() const { return map(builtin_cbrt); } - realvec_t ceil() const { return map(builtin_ceil); } - realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); } - realvec_t cos() const { return map(builtin_cos); } - realvec_t cosh() const { return map(builtin_cosh); } - realvec_t exp() const { return map(builtin_exp); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return map(builtin_exp2); } - realvec_t expm1() const { return map(builtin_expm1); } - realvec_t fabs() const { return map(builtin_fabs); } - realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); } - realvec_t floor() const { return map(builtin_floor); } - realvec_t fma(realvec_t y, realvec_t z) const - { - return map(builtin_fma, y, z); - } - realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); } - realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); } - realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); } - realvec_t frexp(intvec_t* r) const - { - realvec_t res; - intvec_t exp; - for (int d=0; d<size; ++d) { - real_t val = (*this)[d]; - int iexp; - res.set_elt(d, __builtin_frexp(val, &iexp)); - int_t jexp = int_t(iexp); - if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max(); - if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min(); - exp.set_elt(d, jexp); - } - *r = exp; - return res; - } - realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); } - intvec_t ilogb() const - { - intvec_t res; - for (int d=0; d<size; ++d) { - real_t val = (*this)[d]; - int iexp = __builtin_ilogb(val); - int_t jexp = int_t(iexp); - if (val == R(0.0)) jexp = std::numeric_limits<int_t>::min(); - if (__builtin_isinf(val)) jexp = std::numeric_limits<int_t>::max(); - if (__builtin_isnan(val)) jexp = std::numeric_limits<int_t>::min(); - res.set_elt(d, jexp); - } - return res; - } - boolvec_t isfinite() const - { - boolvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_isfinite((*this)[d]) != 0); - } - return res; - } - boolvec_t isinf() const - { - boolvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_isinf((*this)[d]) != 0); - } - return res; - } - boolvec_t isnan() const - { - boolvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_isnan((*this)[d]) != 0); - } - return res; - } - boolvec_t isnormal() const - { - boolvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_isnormal((*this)[d]) != 0); - } - return res; - } - realvec_t ldexp(int_t n) const - { - realvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_ldexp((*this)[d], int(n))); - } - return res; - } - realvec_t ldexp(intvec_t n) const - { - realvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_ldexp((*this)[d], int(n[d]))); - } - return res; + *(vector_t *)p = v; + } + void storeu(real_t *p) const { + // *(vector_t*)p = v; + for (int d = 0; d < size; ++d) + p[d] = (*this)[d]; + // memcpy(p, &v, sizeof res.v); + } + void storeu(real_t *p, size_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff); + } + void storea(real_t *p, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p, m); + } + void storeu(real_t *p, const mask_t &m) const { + for (int d = 0; d < size; ++d) + if (m.m[d]) + p[d] = (*this)[d]; + } + void storeu(real_t *p, size_t ioff, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff, m); + } + + intvec_t as_int() const { + intvec_t res; + std::memcpy(&res.v, &v, sizeof res.v); + return res; + } + intvec_t convert_int() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, int_t((*this)[d])); + return res; + } + + realvec_t operator+() const { return mkvec(+v); } + realvec_t operator-() const { return mkvec(-v); } + + realvec_t operator+(realvec_t x) const { return mkvec(v + x.v); } + realvec_t operator-(realvec_t x) const { return mkvec(v - x.v); } + realvec_t operator*(realvec_t x) const { return mkvec(v * x.v); } + realvec_t operator/(realvec_t x) const { return mkvec(v / x.v); } + + realvec_t &operator+=(const realvec_t &x) { return *this = *this + x; } + realvec_t &operator-=(const realvec_t &x) { return *this = *this - x; } + realvec_t &operator*=(const realvec_t &x) { return *this = *this * x; } + realvec_t &operator/=(const realvec_t &x) { return *this = *this / x; } + + real_t maxval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) { + res = builtin_fmax(res, (*this)[d]); } - realvec_t log() const { return map(builtin_log); } - realvec_t log10() const { return map(builtin_log10); } - realvec_t log1p() const { return map(builtin_log1p); } - realvec_t log2() const { return map(builtin_log2); } - intvec_t lrint() const - { - if (sizeof(int_t) <= sizeof(long)) { - return map(builtin_lrint); - } else if (sizeof(int_t) <= sizeof(long long)) { - return map(builtin_llrint); - } - __builtin_unreachable(); + return res; + } + real_t minval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) { + res = builtin_fmin(res, (*this)[d]); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); + return res; + } + real_t prod() const { + real_t res = (*this)[0]; + for (int d = 1; d < size; ++d) + res *= (*this)[d]; + return res; + } + real_t sum() const { + real_t res = (*this)[0]; + for (int d = 1; d < size; ++d) + res += (*this)[d]; + return res; + } + + boolvec_t operator==(const realvec_t &x) const { + return boolvec_t::mkvec(v == x.v); + } + boolvec_t operator!=(const realvec_t &x) const { + return boolvec_t::mkvec(v != x.v); + } + boolvec_t operator<(const realvec_t &x) const { + return boolvec_t::mkvec(v < x.v); + } + boolvec_t operator<=(const realvec_t &x) const { + return boolvec_t::mkvec(v <= x.v); + } + boolvec_t operator>(const realvec_t &x) const { + return boolvec_t::mkvec(v > x.v); + } + boolvec_t operator>=(const realvec_t &x) const { + return boolvec_t::mkvec(v >= x.v); + } + + realvec_t acos() const { return map(builtin_acos); } + realvec_t acosh() const { return map(builtin_acosh); } + realvec_t asin() const { return map(builtin_asin); } + realvec_t asinh() const { return map(builtin_asinh); } + realvec_t atan() const { return map(builtin_atan); } + realvec_t atan2(realvec_t y) const { return map(builtin_atan2, y); } + realvec_t atanh() const { return map(builtin_atanh); } + realvec_t cbrt() const { return map(builtin_cbrt); } + realvec_t ceil() const { return map(builtin_ceil); } + realvec_t copysign(realvec_t y) const { return map(builtin_copysign, y); } + realvec_t cos() const { return map(builtin_cos); } + realvec_t cosh() const { return map(builtin_cosh); } + realvec_t exp() const { return map(builtin_exp); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return map(builtin_exp2); } + realvec_t expm1() const { return map(builtin_expm1); } + realvec_t fabs() const { return map(builtin_fabs); } + realvec_t fdim(realvec_t y) const { return map(builtin_fdim, y); } + realvec_t floor() const { return map(builtin_floor); } + realvec_t fma(realvec_t y, realvec_t z) const { + return map(builtin_fma, y, z); + } + realvec_t fmax(realvec_t y) const { return map(builtin_fmax, y); } + realvec_t fmin(realvec_t y) const { return map(builtin_fmin, y); } + realvec_t fmod(realvec_t y) const { return map(builtin_fmod, y); } + realvec_t frexp(intvec_t *r) const { + realvec_t res; + intvec_t exp; + for (int d = 0; d < size; ++d) { + real_t val = (*this)[d]; + int iexp; + res.set_elt(d, __builtin_frexp(val, &iexp)); + int_t jexp = int_t(iexp); + if (__builtin_isinf(val)) + jexp = std::numeric_limits<int_t>::max(); + if (__builtin_isnan(val)) + jexp = std::numeric_limits<int_t>::min(); + exp.set_elt(d, jexp); + } + *r = exp; + return res; + } + realvec_t hypot(realvec_t y) const { return map(builtin_hypot, y); } + intvec_t ilogb() const { + intvec_t res; + for (int d = 0; d < size; ++d) { + real_t val = (*this)[d]; + int iexp = __builtin_ilogb(val); + int_t jexp = int_t(iexp); + if (val == R(0.0)) + jexp = std::numeric_limits<int_t>::min(); + if (__builtin_isinf(val)) + jexp = std::numeric_limits<int_t>::max(); + if (__builtin_isnan(val)) + jexp = std::numeric_limits<int_t>::min(); + res.set_elt(d, jexp); } - realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); } - realvec_t pow(realvec_t y) const { return map(builtin_pow, y); } - realvec_t rcp() const { return RV(1.0) / *this; } - realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); } - realvec_t rint() const { return map(builtin_rint); } - realvec_t round() const { return map(builtin_round); } - realvec_t rsqrt() const { return RV(1.0) / sqrt(); } - boolvec_t signbit() const - { - boolvec_t res; - for (int d=0; d<size; ++d) { - res.set_elt(d, builtin_signbit((*this)[d]) != 0); - } - return res; + return res; + } + boolvec_t isfinite() const { + boolvec_t res; + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_isfinite((*this)[d]) != 0); } - realvec_t sin() const { return map(builtin_sin); } - realvec_t sinh() const { return map(builtin_sinh); } - realvec_t sqrt() const { return map(builtin_sqrt); } - realvec_t tan() const { return map(builtin_tan); } - realvec_t tanh() const { return map(builtin_tanh); } - realvec_t trunc() const { return map(builtin_trunc); } - }; - - - - // boolbuiltinvec definitions - - template<typename T, int N> - inline - typename boolbuiltinvec<T,N>::intvec_t boolbuiltinvec<T,N>::as_int() const - { - intvec_t res; - std::memcpy(&res.v, &v, sizeof res.v); return res; } - - template<typename T, int N> - inline - typename boolbuiltinvec<T,N>::intvec_t - boolbuiltinvec<T,N>::convert_int() const - { - return - as_int(); - } - - template<typename T, int N> - inline - typename boolbuiltinvec<T,N>::boolvec_t - boolbuiltinvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const - { - // return v ? x.v : y.v; + boolvec_t isinf() const { boolvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]); + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_isinf((*this)[d]) != 0); + } return res; } - - template<typename T, int N> - inline - typename boolbuiltinvec<T,N>::intvec_t - boolbuiltinvec<T,N>::ifthen(intvec_t x, intvec_t y) const - { - // return v ? x.v : y.v; - intvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]); + boolvec_t isnan() const { + boolvec_t res; + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_isnan((*this)[d]) != 0); + } return res; } - - template<typename T, int N> - inline - typename boolbuiltinvec<T,N>::realvec_t - boolbuiltinvec<T,N>::ifthen(realvec_t x, realvec_t y) const - { - // return v ? x.v : y.v; - realvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, (*this)[d] ? x[d] : y[d]); + boolvec_t isnormal() const { + boolvec_t res; + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_isnormal((*this)[d]) != 0); + } return res; } - - - - // intbuiltinvec definitions - - template<typename T, int N> - inline - typename intbuiltinvec<T,N>::realvec_t intbuiltinvec<T,N>::as_float() const - { + realvec_t ldexp(int_t n) const { realvec_t res; - std::memcpy(&res.v, &v, sizeof res.v); + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_ldexp((*this)[d], int(n))); + } return res; } - - template<typename T, int N> - inline - typename intbuiltinvec<T,N>::realvec_t - intbuiltinvec<T,N>::convert_float() const - { + realvec_t ldexp(intvec_t n) const { realvec_t res; - for (int d=0; d<size; ++d) res.set_elt(d, real_t((*this)[d])); + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_ldexp((*this)[d], int(n[d]))); + } return res; } - - - - // Wrappers - - // boolbuiltinvec wrappers - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline bool all(boolbuiltinvec<real_t, size> x) { return x.all(); } - - template<typename real_t, int size> - inline bool any(boolbuiltinvec<real_t, size> x) { return x.any(); } - - template<typename real_t, int size> - inline - boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, - boolbuiltinvec<real_t, size> x, - boolbuiltinvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, - intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, - realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - - - // intbuiltinvec wrappers - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x) - { - return x.abs(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x) - { - return x.as_bool(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x) - { - return x.as_float(); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> y, - intbuiltinvec<real_t, size> z) - { - return x.bitifthen(y, z); - } - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x) - { - return x.clz(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> convert_bool(intbuiltinvec<real_t, size> x) - { - return x.convert_bool(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> convert_float(intbuiltinvec<real_t, size> x) - { - return x.convert_float(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x) - { - return x.isignbit(); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x, - typename intbuiltinvec<real_t, size>::int_t n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> y) - { - return x.max(y); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> y) - { - return x.min(y); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x) - { - return x.popcount(); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> - rotate(intbuiltinvec<real_t, size> x, - typename intbuiltinvec<real_t, size>::int_t n) - { - return x.rotate(n); - } - - template<typename real_t, int size> - inline - intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> n) - { - return x.rotate(n); - } - - - - // realbuiltinvec wrappers - - template<typename real_t, int size> - inline - realbuiltinvec<real_t, size> - loada(real_t const* p, - realbuiltinvec<real_t, size> x, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.loada(p, m); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> - loadu(real_t const* p, - realbuiltinvec<real_t, size> x, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, m); - } - - template<typename real_t, int size> - inline - realbuiltinvec<real_t, size> - loadu(real_t const* p, size_t ioff, - realbuiltinvec<real_t, size> x, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, ioff, m); - } - - template<typename real_t, int size> - inline void storea(realbuiltinvec<real_t, size> x, real_t* p) - { - return x.storea(p); - } - - template<typename real_t, int size> - inline void storeu(realbuiltinvec<real_t, size> x, real_t* p) - { - return x.storeu(p); - } - - template<typename real_t, int size> - inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff) - { - return x.storeu(p, ioff); - } - - template<typename real_t, int size> - inline void storea(realbuiltinvec<real_t, size> x, real_t* p, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.storea(p, m); - } - - template<typename real_t, int size> - inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.storeu(p, m); - } - - template<typename real_t, int size> - inline void storeu(realbuiltinvec<real_t, size> x, real_t* p, size_t ioff, - typename realbuiltinvec<real_t, size>::mask_t const& m) - { - return x.storeu(p, ioff, m); - } - - - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline real_t maxval(realbuiltinvec<real_t, size> x) - { - return x.maxval(); - } - - template<typename real_t, int size> - inline real_t minval(realbuiltinvec<real_t, size> x) - { - return x.minval(); - } - - template<typename real_t, int size> - inline real_t prod(realbuiltinvec<real_t, size> x) - { - return x.prod(); - } - - template<typename real_t, int size> - inline real_t sum(realbuiltinvec<real_t, size> x) - { - return x.sum(); - } - - - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x) - { - return x.acos(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x) - { - return x.acosh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x) - { - return x.asin(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x) - { - return x.asinh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x) - { - return x.atan(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.atan2(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x) - { - return x.atanh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x) - { - return x.cbrt(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x) - { - return x.ceil(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.copysign(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x) - { - return x.cos(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x) - { - return x.cosh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x) - { - return x.exp(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x) - { - return x.exp10(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x) - { - return x.exp2(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x) - { - return x.expm1(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x) - { - return x.fabs(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x) - { - return x.floor(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.fdim(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y, - realbuiltinvec<real_t, size> z) - { - return x.fma(y, z); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.fmax(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.fmin(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.fmod(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size>* r) - { - return x.frexp(r); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.hypot(y); - } - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x) - { - return x.ilogb(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x) - { - return x.isfinite(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x) - { - return x.isinf(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x) - { - return x.isnan(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x) - { - return x.isnormal(); - } - - template<typename real_t, int size> - inline - realbuiltinvec<real_t, size> - ldexp(realbuiltinvec<real_t, size> x, - typename intbuiltinvec<real_t, size>::int_t n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline - realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x, - intbuiltinvec<real_t, size> n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x) - { - return x.log(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x) - { - return x.log10(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x) - { - return x.log1p(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x) - { - return x.log2(); - } - - template<typename real_t, int size> - inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x) - { - return x.lrint(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y, - realbuiltinvec<real_t, size> z) - { - return x.mad(y, z); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.nextafter(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.pow(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x) - { - return x.rcp(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x, - realbuiltinvec<real_t, size> y) - { - return x.remainder(y); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x) - { - return x.rint(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x) - { - return x.round(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x) - { - return x.rsqrt(); - } - - template<typename real_t, int size> - inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x) - { - return x.signbit(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x) - { - return x.sin(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x) - { - return x.sinh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x) - { - return x.sqrt(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x) - { - return x.tan(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x) - { - return x.tanh(); - } - - template<typename real_t, int size> - inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x) - { - return x.trunc(); - } - - - -#ifndef VML_NO_IOSTREAM - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - boolbuiltinvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - intbuiltinvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - realbuiltinvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; + realvec_t log() const { return map(builtin_log); } + realvec_t log10() const { return map(builtin_log10); } + realvec_t log1p() const { return map(builtin_log1p); } + realvec_t log2() const { return map(builtin_log2); } + intvec_t lrint() const { + if (sizeof(int_t) <= sizeof(long)) { + return map(builtin_lrint); + } else if (sizeof(int_t) <= sizeof(long long)) { + return map(builtin_llrint); + } + __builtin_unreachable(); + } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return map(builtin_nextafter, y); } + realvec_t pow(realvec_t y) const { return map(builtin_pow, y); } + realvec_t rcp() const { return RV(1.0) / *this; } + realvec_t remainder(realvec_t y) const { return map(builtin_remainder, y); } + realvec_t rint() const { return map(builtin_rint); } + realvec_t round() const { return map(builtin_round); } + realvec_t rsqrt() const { return RV(1.0) / sqrt(); } + boolvec_t signbit() const { + boolvec_t res; + for (int d = 0; d < size; ++d) { + res.set_elt(d, builtin_signbit((*this)[d]) != 0); } - os << "]"; - return os; + return res; } + realvec_t sin() const { return map(builtin_sin); } + realvec_t sinh() const { return map(builtin_sinh); } + realvec_t sqrt() const { return map(builtin_sqrt); } + realvec_t tan() const { return map(builtin_tan); } + realvec_t tanh() const { return map(builtin_tanh); } + realvec_t trunc() const { return map(builtin_trunc); } +}; + +// boolbuiltinvec definitions + +template <typename T, int N> +inline typename boolbuiltinvec<T, N>::intvec_t +boolbuiltinvec<T, N>::as_int() const { + intvec_t res; + std::memcpy(&res.v, &v, sizeof res.v); + return res; +} + +template <typename T, int N> +inline typename boolbuiltinvec<T, N>::intvec_t +boolbuiltinvec<T, N>::convert_int() const { + return -as_int(); +} + +template <typename T, int N> +inline typename boolbuiltinvec<T, N>::boolvec_t +boolbuiltinvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const { + // return v ? x.v : y.v; + boolvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, (*this)[d] ? x[d] : y[d]); + return res; +} + +template <typename T, int N> +inline typename boolbuiltinvec<T, N>::intvec_t +boolbuiltinvec<T, N>::ifthen(intvec_t x, intvec_t y) const { + // return v ? x.v : y.v; + intvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, (*this)[d] ? x[d] : y[d]); + return res; +} + +template <typename T, int N> +inline typename boolbuiltinvec<T, N>::realvec_t +boolbuiltinvec<T, N>::ifthen(realvec_t x, realvec_t y) const { + // return v ? x.v : y.v; + realvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, (*this)[d] ? x[d] : y[d]); + return res; +} + +// intbuiltinvec definitions + +template <typename T, int N> +inline typename intbuiltinvec<T, N>::realvec_t +intbuiltinvec<T, N>::as_float() const { + realvec_t res; + std::memcpy(&res.v, &v, sizeof res.v); + return res; +} + +template <typename T, int N> +inline typename intbuiltinvec<T, N>::realvec_t +intbuiltinvec<T, N>::convert_float() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.set_elt(d, real_t((*this)[d])); + return res; +} + +// Wrappers + +// boolbuiltinvec wrappers + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> as_int(boolbuiltinvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> convert_int(boolbuiltinvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline bool all(boolbuiltinvec<real_t, size> x) { + return x.all(); +} + +template <typename real_t, int size> +inline bool any(boolbuiltinvec<real_t, size> x) { + return x.any(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, + boolbuiltinvec<real_t, size> x, + boolbuiltinvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, + intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> ifthen(boolbuiltinvec<real_t, size> c, + realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return c.ifthen(x, y); +} + +// intbuiltinvec wrappers + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> abs(intbuiltinvec<real_t, size> x) { + return x.abs(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> as_bool(intbuiltinvec<real_t, size> x) { + return x.as_bool(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> as_float(intbuiltinvec<real_t, size> x) { + return x.as_float(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> bitifthen(intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> y, + intbuiltinvec<real_t, size> z) { + return x.bitifthen(y, z); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> clz(intbuiltinvec<real_t, size> x) { + return x.clz(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> +convert_bool(intbuiltinvec<real_t, size> x) { + return x.convert_bool(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> +convert_float(intbuiltinvec<real_t, size> x) { + return x.convert_float(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> isignbit(intbuiltinvec<real_t, size> x) { + return x.isignbit(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> +lsr(intbuiltinvec<real_t, size> x, + typename intbuiltinvec<real_t, size>::int_t n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> lsr(intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> max(intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> y) { + return x.max(y); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> min(intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> y) { + return x.min(y); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> popcount(intbuiltinvec<real_t, size> x) { + return x.popcount(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> +rotate(intbuiltinvec<real_t, size> x, + typename intbuiltinvec<real_t, size>::int_t n) { + return x.rotate(n); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> rotate(intbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> n) { + return x.rotate(n); +} + +// realbuiltinvec wrappers + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> +loada(real_t const *p, realbuiltinvec<real_t, size> x, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.loada(p, m); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> +loadu(real_t const *p, realbuiltinvec<real_t, size> x, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.loadu(p, m); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> +loadu(real_t const *p, size_t ioff, realbuiltinvec<real_t, size> x, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.loadu(p, ioff, m); +} + +template <typename real_t, int size> +inline void storea(realbuiltinvec<real_t, size> x, real_t *p) { + return x.storea(p); +} + +template <typename real_t, int size> +inline void storeu(realbuiltinvec<real_t, size> x, real_t *p) { + return x.storeu(p); +} + +template <typename real_t, int size> +inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff) { + return x.storeu(p, ioff); +} + +template <typename real_t, int size> +inline void storea(realbuiltinvec<real_t, size> x, real_t *p, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.storea(p, m); +} + +template <typename real_t, int size> +inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.storeu(p, m); +} + +template <typename real_t, int size> +inline void storeu(realbuiltinvec<real_t, size> x, real_t *p, size_t ioff, + typename realbuiltinvec<real_t, size>::mask_t const &m) { + return x.storeu(p, ioff, m); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> as_int(realbuiltinvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> convert_int(realbuiltinvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline real_t maxval(realbuiltinvec<real_t, size> x) { + return x.maxval(); +} + +template <typename real_t, int size> +inline real_t minval(realbuiltinvec<real_t, size> x) { + return x.minval(); +} + +template <typename real_t, int size> +inline real_t prod(realbuiltinvec<real_t, size> x) { + return x.prod(); +} + +template <typename real_t, int size> +inline real_t sum(realbuiltinvec<real_t, size> x) { + return x.sum(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> acos(realbuiltinvec<real_t, size> x) { + return x.acos(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> acosh(realbuiltinvec<real_t, size> x) { + return x.acosh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> asin(realbuiltinvec<real_t, size> x) { + return x.asin(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> asinh(realbuiltinvec<real_t, size> x) { + return x.asinh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> atan(realbuiltinvec<real_t, size> x) { + return x.atan(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> atan2(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.atan2(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> atanh(realbuiltinvec<real_t, size> x) { + return x.atanh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> cbrt(realbuiltinvec<real_t, size> x) { + return x.cbrt(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> ceil(realbuiltinvec<real_t, size> x) { + return x.ceil(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> copysign(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.copysign(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> cos(realbuiltinvec<real_t, size> x) { + return x.cos(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> cosh(realbuiltinvec<real_t, size> x) { + return x.cosh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> exp(realbuiltinvec<real_t, size> x) { + return x.exp(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> exp10(realbuiltinvec<real_t, size> x) { + return x.exp10(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> exp2(realbuiltinvec<real_t, size> x) { + return x.exp2(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> expm1(realbuiltinvec<real_t, size> x) { + return x.expm1(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fabs(realbuiltinvec<real_t, size> x) { + return x.fabs(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> floor(realbuiltinvec<real_t, size> x) { + return x.floor(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fdim(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.fdim(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fma(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y, + realbuiltinvec<real_t, size> z) { + return x.fma(y, z); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fmax(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.fmax(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fmin(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.fmin(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> fmod(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.fmod(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> frexp(realbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> *r) { + return x.frexp(r); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> hypot(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.hypot(y); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> ilogb(realbuiltinvec<real_t, size> x) { + return x.ilogb(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> isfinite(realbuiltinvec<real_t, size> x) { + return x.isfinite(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> isinf(realbuiltinvec<real_t, size> x) { + return x.isinf(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> isnan(realbuiltinvec<real_t, size> x) { + return x.isnan(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> isnormal(realbuiltinvec<real_t, size> x) { + return x.isnormal(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> +ldexp(realbuiltinvec<real_t, size> x, + typename intbuiltinvec<real_t, size>::int_t n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> ldexp(realbuiltinvec<real_t, size> x, + intbuiltinvec<real_t, size> n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> log(realbuiltinvec<real_t, size> x) { + return x.log(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> log10(realbuiltinvec<real_t, size> x) { + return x.log10(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> log1p(realbuiltinvec<real_t, size> x) { + return x.log1p(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> log2(realbuiltinvec<real_t, size> x) { + return x.log2(); +} + +template <typename real_t, int size> +inline intbuiltinvec<real_t, size> lrint(realbuiltinvec<real_t, size> x) { + return x.lrint(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> mad(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y, + realbuiltinvec<real_t, size> z) { + return x.mad(y, z); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> nextafter(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.nextafter(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> pow(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.pow(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> rcp(realbuiltinvec<real_t, size> x) { + return x.rcp(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> remainder(realbuiltinvec<real_t, size> x, + realbuiltinvec<real_t, size> y) { + return x.remainder(y); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> rint(realbuiltinvec<real_t, size> x) { + return x.rint(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> round(realbuiltinvec<real_t, size> x) { + return x.round(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> rsqrt(realbuiltinvec<real_t, size> x) { + return x.rsqrt(); +} + +template <typename real_t, int size> +inline boolbuiltinvec<real_t, size> signbit(realbuiltinvec<real_t, size> x) { + return x.signbit(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> sin(realbuiltinvec<real_t, size> x) { + return x.sin(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> sinh(realbuiltinvec<real_t, size> x) { + return x.sinh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> sqrt(realbuiltinvec<real_t, size> x) { + return x.sqrt(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> tan(realbuiltinvec<real_t, size> x) { + return x.tan(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> tanh(realbuiltinvec<real_t, size> x) { + return x.tanh(); +} + +template <typename real_t, int size> +inline realbuiltinvec<real_t, size> trunc(realbuiltinvec<real_t, size> x) { + return x.trunc(); +} + +#ifndef VML_NO_IOSTREAM +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + boolbuiltinvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + intbuiltinvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + realbuiltinvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} #endif - + } // namespace vecmathlib -#endif // #ifndef VEC_BUILTIN_H +#endif // #ifndef VEC_BUILTIN_H @@ -5,74 +5,67 @@ #include <cstdlib> +namespace vecmathlib { +template <typename realvec_t> class mask_t { -namespace vecmathlib { - - template<typename realvec_t> - class mask_t { - - typedef typename realvec_t::boolvec_t boolvec_t; - typedef typename realvec_t::intvec_t intvec_t; - static const int size = realvec_t::size; - - public: - std::ptrdiff_t imin, imax; - std::ptrdiff_t i; - boolvec_t m; - bool all_m; - - public: - - // Construct a mask from a boolvec - mask_t(boolvec_t m_): m(m_), all_m(all(m)) {} - - // Construct a mask for a particular location i - mask_t(std::ptrdiff_t i_, - std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff): - imin(imin_), imax(imax_), i(i_) - { - all_m = i-imin >= 0 && i+size-1-imax < 0; - if (__builtin_expect(all_m, true)) { - m = true; - } else { - m = (! isignbit(intvec_t(i - imin) + intvec_t::iota()) && - isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota())); - } + typedef typename realvec_t::boolvec_t boolvec_t; + typedef typename realvec_t::intvec_t intvec_t; + static const int size = realvec_t::size; + +public: + std::ptrdiff_t imin, imax; + std::ptrdiff_t i; + boolvec_t m; + bool all_m; + +public: + // Construct a mask from a boolvec + mask_t(boolvec_t m_) : m(m_), all_m(all(m)) {} + + // Construct a mask for a particular location i + mask_t(std::ptrdiff_t i_, std::ptrdiff_t imin_, std::ptrdiff_t imax_, + std::ptrdiff_t ioff) + : imin(imin_), imax(imax_), i(i_) { + all_m = i - imin >= 0 && i + size - 1 - imax < 0; + if (__builtin_expect(all_m, true)) { + m = true; + } else { + m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) && + isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota())); } - - // Construct a mask for a loop starting at imin, aligned down - mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff): - imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size) - { - all_m = i-imin >= 0 && i+size-1-imax < 0; - if (__builtin_expect(all_m, true)) { - m = true; - } else { - m = (! isignbit(intvec_t(i - imin) + intvec_t::iota()) && - isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota())); - } + } + + // Construct a mask for a loop starting at imin, aligned down + mask_t(std::ptrdiff_t imin_, std::ptrdiff_t imax_, std::ptrdiff_t ioff) + : imin(imin_), imax(imax_), i(imin_ - (ioff + imin_) % size) { + all_m = i - imin >= 0 && i + size - 1 - imax < 0; + if (__builtin_expect(all_m, true)) { + m = true; + } else { + m = (!isignbit(intvec_t(i - imin) + intvec_t::iota()) && + isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota())); } - - // Get current index - std::ptrdiff_t index() const { return i; } - - // Looping condition - operator bool() const { return i<imax; } - - // Loop stepper - void operator++() - { - i += size; - all_m = i + size-1 - imax < 0; - if (__builtin_expect(all_m, true)) { - m = true; - } else { - m = isignbit(intvec_t(i + size-1 - imax) + intvec_t::iota()); - } + } + + // Get current index + std::ptrdiff_t index() const { return i; } + + // Looping condition + operator bool() const { return i < imax; } + + // Loop stepper + void operator++() { + i += size; + all_m = i + size - 1 - imax < 0; + if (__builtin_expect(all_m, true)) { + m = true; + } else { + m = isignbit(intvec_t(i + size - 1 - imax) + intvec_t::iota()); } - }; - + } +}; + } // namespace vecmathlib -#endif // #ifndef VEC_MASK_H +#endif // #ifndef VEC_MASK_H diff --git a/vec_mic_double8.h b/vec_mic_double8.h index 68dd5aa..ef22088 100644 --- a/vec_mic_double8.h +++ b/vec_mic_double8.h @@ -12,697 +12,585 @@ // MIC intrinsics #include <immintrin.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_8 - template<> struct boolvec<double,8>; - template<> struct intvec<double,8>; - template<> struct realvec<double,8>; - - - - template<> - struct boolvec<double,8>: floatprops<double> - { - static const int size = 8; - typedef bool scalar_t; - typedef __mask8 bvector_t; - static const int alignment = sizeof(bvector_t); - - // static_assert(size * sizeof(real_t) == sizeof(bvector_t), - // "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(const boolvec& x): v(x.v) {} - // boolvec& operator=(const boolvec& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(- bvector_t(a)) {} - boolvec(const bool* as): - v((bvector_t(as[0]) << 0) | - (bvector_t(as[1]) << 1) | - (bvector_t(as[2]) << 2) | - (bvector_t(as[3]) << 3) | - (bvector_t(as[4]) << 4) | - (bvector_t(as[5]) << 5) | - (bvector_t(as[6]) << 6) | - (bvector_t(as[7]) << 7)) - {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return (v >> n) & 1; - } - boolvec& set_elt(int n, bool a) - { - v &= ~ (bvector_t(1) << n); - v |= bvector_t(a) << n; - return *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return _mm512_knot(v); } - - boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); } - boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); } - boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); } - boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); } - - bool all() const { return _mm512_kortestc(v, v); } - bool any() const { return ! bool(_mm512_kortestz(v, v)); } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,8>: floatprops<double> - { - static const int size = 8; - typedef int_t scalar_t; - typedef __m512i ivector_t; - static const int alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(const intvec& x): v(x.v) {} - // intvec& operator=(const intvec& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm512_set1_epi64(a)) {} - intvec(const int_t* as) - { - v = _mm512_undefined_epi32(); - // v = _mm512_loadunpacklo_epi32(v, as); - // v = _mm512_loadunpackhi_epi32(v, as+8); - for (int n=0; n<size; ++n) set_elt(n, as[n]); - } - static intvec iota() - { - intvec r; - for (int n=0; n<size; ++n) r.set_elt(n, n); - return r; - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - private: - static __mmask8 mask16tomask8(__mmask16 m16) - { - // combine 01 - m16 = ((m16 >> 1) | m16) & 0b0011001100110011; - // combine 0123 - m16 = ((m16 >> 2) | m16) & 0b0000111100001111; - // combine 01234567 - m16 = ((m16 >> 4) | m16) & 0b0000000011111111; - return m16; - } - public: - boolvec_t as_bool() const { return convert_bool(); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - __mmask16 r16 = _mm512_test_epi32_mask(v, v); - return mask16tomask8(r16); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); } - intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); } - - intvec& operator+=(const intvec& x) { return *this=*this+x; } - intvec& operator-=(const intvec& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); } - intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); } - intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); } - - intvec& operator&=(const intvec& x) { return *this=*this&x; } - intvec& operator|=(const intvec& x) { return *this=*this|x; } - intvec& operator^=(const intvec& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec lsr(int_t n) const - { - if (n < 32) { - __m512i vlo = _mm512_srli_epi32(v, n); - __m512i vhi = _mm512_slli_epi32(v, 32-n); - vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB); - return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo); - } else { - __m512i vlo = _mm512_srli_epi32(v, n-32); - __m512i vhi = _mm512_setzero_epi32(); - return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo); - } - } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const - { - if (n < 32) { - __mm512i vlo = _mm512_srai_epi32(v, n); - __mm512i vlo0 = _mm512_srli_epi32(v, n); - __mm512i vhi = _mm512_slli_epi32(v, 32-n); - vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB); - return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0); - } else { - __m512i vlo = _mm512_srai_epi32(v, n-32); - __m512i vhi = _mm512_srai_epi32(v, 31); - return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo); - } - } - intvec operator<<(int_t n) const - { - if (n < 32) { - __m512i vlo = _mm512_srli_epi32(v, n); - __m512i vhi = _mm512_slli_epi32(v, 32-n); - vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB); - return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo); - } else { - __m512i vlo = _mm512_setzero_epi32(); - __m512i vhi = _mm512_slli_epi32(v, n-32); - return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo); - } - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec lsr(intvec n) const - { - // TODO: improve this - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - // TODO: improve this - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - // TODO: improve this - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const - { - // Return 8*sizeof(TYPE) when the input is 0 - intvec_t r; - for (int i=0; i<size; ++i) { - // __lzcnt64 - r.set_elt(i, __builtin_clzll((*this)[i])); - } - return r; - } - intvec_t popcount() const - { - intvec_t r; - for (int i=0; i<size; ++i) { - // _mm_popcnt_u64 - r.set_elt(i, __builtin_popcountll((*this)[i])); - } - return r; - } - - - - boolvec_t operator==(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ)); - } - boolvec_t operator!=(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE)); - } - boolvec_t operator<(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT)); - } - boolvec_t operator<=(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE)); - } - boolvec_t operator>(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT)); - } - boolvec_t operator>=(const intvec& x) const - { - return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE)); - } - - intvec_t abs() const; - boolvec_t isignbit() const; - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<double,8>: floatprops<double> - { - static const int size = 8; - typedef real_t scalar_t; - typedef __m512d vector_t; - static const int alignment = sizeof(vector_t); - - static const char* name() { return "<MIC:8*double>"; } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(const realvec& x): v(x.v) {} - // realvec& operator=(const realvec& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm512_set1_pd(a)) {} - realvec(const real_t* as) - { - v = _mm512_undefined_pd(); - // v = _mm512_loadunpacklo_pd(v, as); - // v = _mm512_loadunpackhi_pd(v, as+8); - for (int n=0; n<size; ++n) set_elt(n, as[n]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(const real_t* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm512_load_pd(p); - } - static realvec_t loadu(const real_t* p) - { - realvec_t r(_mm512_undefined_pd()); - r.v = _mm512_loadunpacklo_pd(r.v, p); - r.v = _mm512_loadunpackhi_pd(r.v, p+8); - return r.v; - } - static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(const real_t* p, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm512_mask_load_pd(v, m.m.v, p); +template <> struct boolvec<double, 8>; +template <> struct intvec<double, 8>; +template <> struct realvec<double, 8>; + +template <> struct boolvec<double, 8> : floatprops<double> { + static const int size = 8; + typedef bool scalar_t; + typedef __mask8 bvector_t; + static const int alignment = sizeof(bvector_t); + + // static_assert(size * sizeof(real_t) == sizeof(bvector_t), + // "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(const boolvec& x): v(x.v) {} + // boolvec& operator=(const boolvec& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(-bvector_t(a)) {} + boolvec(const bool *as) + : v((bvector_t(as[0]) << 0) | (bvector_t(as[1]) << 1) | + (bvector_t(as[2]) << 2) | (bvector_t(as[3]) << 3) | + (bvector_t(as[4]) << 4) | (bvector_t(as[5]) << 5) | + (bvector_t(as[6]) << 6) | (bvector_t(as[7]) << 7)) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return (v >> n) & 1; } + boolvec &set_elt(int n, bool a) { + v &= ~(bvector_t(1) << n); + v |= bvector_t(a) << n; + return *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return _mm512_knot(v); } + + boolvec operator&&(boolvec x) const { return _mm512_kand(v, x.v); } + boolvec operator||(boolvec x) const { return _mm512_kor(v, x.v); } + boolvec operator==(boolvec x) const { return _mm512_kxnor(v, x.v); } + boolvec operator!=(boolvec x) const { return _mm512_kxor(v, x.v); } + + bool all() const { return _mm512_kortestc(v, v); } + bool any() const { return !bool(_mm512_kortestz(v, v)); } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 8> : floatprops<double> { + static const int size = 8; + typedef int_t scalar_t; + typedef __m512i ivector_t; + static const int alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(const intvec& x): v(x.v) {} + // intvec& operator=(const intvec& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm512_set1_epi64(a)) {} + intvec(const int_t *as) { + v = _mm512_undefined_epi32(); + // v = _mm512_loadunpacklo_epi32(v, as); + // v = _mm512_loadunpackhi_epi32(v, as+8); + for (int n = 0; n < size; ++n) + set_elt(n, as[n]); + } + static intvec iota() { + intvec r; + for (int n = 0; n < size; ++n) + r.set_elt(n, n); + return r; + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + +private: + static __mmask8 mask16tomask8(__mmask16 m16) { + // combine 01 + m16 = ((m16 >> 1) | m16) & 0b0011001100110011; + // combine 0123 + m16 = ((m16 >> 2) | m16) & 0b0000111100001111; + // combine 01234567 + m16 = ((m16 >> 4) | m16) & 0b0000000011111111; + return m16; + } + +public: + boolvec_t as_bool() const { return convert_bool(); } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + __mmask16 r16 = _mm512_test_epi32_mask(v, v); + return mask16tomask8(r16); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + intvec operator+(intvec x) const { return _mm512_add_epi64(v, x.v); } + intvec operator-(intvec x) const { return _mm512_sub_epi64(v, x.v); } + + intvec &operator+=(const intvec &x) { return *this = *this + x; } + intvec &operator-=(const intvec &x) { return *this = *this - x; } + + intvec operator~() const { return IV(~U(0)) ^ *this; } + intvec operator&(intvec x) const { return _mm512_and_epi64(v, x.v); } + intvec operator|(intvec x) const { return _mm512_or_epi64(v, x.v); } + intvec operator^(intvec x) const { return _mm512_xor_epi64(v, x.v); } + + intvec &operator&=(const intvec &x) { return *this = *this & x; } + intvec &operator|=(const intvec &x) { return *this = *this | x; } + intvec &operator^=(const intvec &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec lsr(int_t n) const { + if (n < 32) { + __m512i vlo = _mm512_srli_epi32(v, n); + __m512i vhi = _mm512_slli_epi32(v, 32 - n); + vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB); + return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo); + } else { + __m512i vlo = _mm512_srli_epi32(v, n - 32); + __m512i vhi = _mm512_setzero_epi32(); + return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo); } - realvec_t loadu(const real_t* p, const mask_t& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } + } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { + if (n < 32) { + __mm512i vlo = _mm512_srai_epi32(v, n); + __mm512i vlo0 = _mm512_srli_epi32(v, n); + __mm512i vhi = _mm512_slli_epi32(v, 32 - n); + vhi = _mm512_swizzle_epi32(vhi, _MM_SWIZ_REG_CDAB); + return _mm512_mask_or_epi32(vlo, 0xb0101010101010101, vhi, vlo0); + } else { + __m512i vlo = _mm512_srai_epi32(v, n - 32); + __m512i vhi = _mm512_srai_epi32(v, 31); + return _mm512_mask_swizzle_epi32(vhi, 0xb0101010101010101, vlo); } - realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); + } + intvec operator<<(int_t n) const { + if (n < 32) { + __m512i vlo = _mm512_srli_epi32(v, n); + __m512i vhi = _mm512_slli_epi32(v, 32 - n); + vlo = _mm512_swizzle_epi32(vlo, _MM_SWIZ_REG_CDAB); + return _mm512_mask_or_epi32(vhi, 0xb1010101010101010, vhi, vlo); + } else { + __m512i vlo = _mm512_setzero_epi32(); + __m512i vhi = _mm512_slli_epi32(v, n - 32); + return _mm512_mask_swizzle_epi32(vhi, 0xb1010101010101010, vlo); } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm512_store_pd(p, v); + } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec lsr(intvec n) const { + // TODO: improve this + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - void storeu(real_t* p) const - { - _mm512_packstorelo_pd(p, v); - _mm512_packstorehi_pd(p+8, v); + return r; + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { + // TODO: improve this + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); + return r; + } + intvec operator<<(intvec n) const { + // TODO: improve this + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - void storea(real_t* p, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm512_mask_store_pd(p, m.m.v, v); + return r; + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const { + // Return 8*sizeof(TYPE) when the input is 0 + intvec_t r; + for (int i = 0; i < size; ++i) { + // __lzcnt64 + r.set_elt(i, __builtin_clzll((*this)[i])); } - void storeu(real_t* p, const mask_t& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - for (int n=0; n<size; ++n) { - if (m.m[n]) p[n] = (*this)[n]; - } - } + return r; + } + intvec_t popcount() const { + intvec_t r; + for (int i = 0; i < size; ++i) { + // _mm_popcnt_u64 + r.set_elt(i, __builtin_popcountll((*this)[i])); } - void storeu(real_t* p, std::ptrdiff_t ioff, const mask_t& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); + return r; + } + + boolvec_t operator==(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_EQ)); + } + boolvec_t operator!=(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_NE)); + } + boolvec_t operator<(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LT)); + } + boolvec_t operator<=(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_LE)); + } + boolvec_t operator>(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GT)); + } + boolvec_t operator>=(const intvec &x) const { + return mask16tomask8(_mm512_cmp_epi32_mask(v, x.v, _MM_CMPINT_GE)); + } + + intvec_t abs() const; + boolvec_t isignbit() const; + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<double, 8> : floatprops<double> { + static const int size = 8; + typedef real_t scalar_t; + typedef __m512d vector_t; + static const int alignment = sizeof(vector_t); + + static const char *name() { return "<MIC:8*double>"; } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(const realvec& x): v(x.v) {} + // realvec& operator=(const realvec& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm512_set1_pd(a)) {} + realvec(const real_t *as) { + v = _mm512_undefined_pd(); + // v = _mm512_loadunpacklo_pd(v, as); + // v = _mm512_loadunpackhi_pd(v, as+8); + for (int n = 0; n < size; ++n) + set_elt(n, as[n]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(const real_t *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm512_load_pd(p); + } + static realvec_t loadu(const real_t *p) { + realvec_t r(_mm512_undefined_pd()); + r.v = _mm512_loadunpacklo_pd(r.v, p); + r.v = _mm512_loadunpackhi_pd(r.v, p + 8); + return r.v; + } + static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(const real_t *p, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm512_mask_load_pd(v, m.m.v, p); + } + realvec_t loadu(const real_t *p, const mask_t &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - - - - intvec_t as_int() const { return _mm512_castpd_si512(v); } - intvec_t convert_int() const - { - intvec_t r(_mm512_undefined_epi32()); - for (int n=0; n<size; ++n) { - r.set_elt(n, floatprops::convert_int((*this)[n])); + } + realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm512_store_pd(p, v); + } + void storeu(real_t *p) const { + _mm512_packstorelo_pd(p, v); + _mm512_packstorehi_pd(p + 8, v); + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm512_mask_store_pd(p, m.m.v, v); + } + void storeu(real_t *p, const mask_t &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + for (int n = 0; n < size; ++n) { + if (m.m[n]) + p[n] = (*this)[n]; } - return r; - } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); } - realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); } - realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); } - realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); } - - realvec& operator+=(const realvec& x) { return *this=*this+x; } - realvec& operator-=(const realvec& x) { return *this=*this-x; } - realvec& operator*=(const realvec& x) { return *this=*this*x; } - realvec& operator/=(const realvec& x) { return *this=*this/x; } - - real_t maxval() const { returm _mm512_reduce_gmax_pd(v); } - real_t minval() const { returm _mm512_reduce_gmin_pd(v); } - real_t prod() const { returm _mm512_reduce_mul_pd(v); } - real_t sum() const { returm _mm512_reduce_add_pd(v); } - - - - boolvec_t operator==(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ); - } - boolvec_t operator!=(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here - } - boolvec_t operator<(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ); - } - boolvec_t operator<=(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ); - } - boolvec_t operator>(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ); } - boolvec_t operator>=(const realvec& x) const - { - return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ); - } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return _mm512_ceil_pd(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return MF::vml_fabs(*this); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return _mm512_floor_pd(v); } - realvec fma(realvec y, realvec z) const - { - return _mm512_fmadd_pd(v, x.v, y.v); + } + void storeu(real_t *p, std::ptrdiff_t ioff, const mask_t &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return _mm512_castpd_si512(v); } + intvec_t convert_int() const { + intvec_t r(_mm512_undefined_epi32()); + for (int n = 0; n < size; ++n) { + r.set_elt(n, floatprops::convert_int((*this)[n])); } - realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); } - realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { + return r; + } + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return _mm512_add_pd(v, x.v); } + realvec operator-(realvec x) const { return _mm512_sub_pd(v, x.v); } + realvec operator*(realvec x) const { return _mm512_mul_pd(v, x.v); } + realvec operator/(realvec x) const { return _mm512_div_pd(v, x.v); } + + realvec &operator+=(const realvec &x) { return *this = *this + x; } + realvec &operator-=(const realvec &x) { return *this = *this - x; } + realvec &operator*=(const realvec &x) { return *this = *this * x; } + realvec &operator/=(const realvec &x) { return *this = *this / x; } + + real_t maxval() const { returm _mm512_reduce_gmax_pd(v); } + real_t minval() const { returm _mm512_reduce_gmin_pd(v); } + real_t prod() const { returm _mm512_reduce_mul_pd(v); } + real_t sum() const { returm _mm512_reduce_add_pd(v); } + + boolvec_t operator==(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_EQ_OQ); + } + boolvec_t operator!=(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_NEQ_UQ); // Note: _UQ here + } + boolvec_t operator<(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_LT_OQ); + } + boolvec_t operator<=(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_LE_OQ); + } + boolvec_t operator>(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_GT_OQ); + } + boolvec_t operator>=(const realvec &x) const { + return _mm512_cmp_pd(v, x.v, _CMP_GE_OQ); + } + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return _mm512_ceil_pd(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return MF::vml_fabs(*this); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return _mm512_floor_pd(v); } + realvec fma(realvec y, realvec z) const { + return _mm512_fmadd_pd(v, x.v, y.v); + } + realvec fmax(realvec y) const { return _mm512_gmax_pd(v, y.v); } + realvec fmin(realvec y) const { return _mm512_gmin_pd(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { #ifdef VML_HAVE_NAN - return _mm512_cmp_pd(v, v, _CMP_UNORD_Q); + return _mm512_cmp_pd(v, v, _CMP_UNORD_Q); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return _mm512_fmadd_pd(v, x.v, y.v); - } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT); - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return as_int().signbit(); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return _mm512_sqrt_pd(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); } - }; - - - - // boolvec definitions - - inline intvec<double,4> boolvec<double,4>::as_int() const - { - return _mm512_castpd_si512(v); - } - - inline intvec<double,4> boolvec<double,4>::convert_int() const - { - return ifthen(v, IV(I(1)), IV(I(0))); - } - - inline - boolvec<double,4> boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return (v & x.v) | (~v & y.v); - } - - inline - intvec<double,4> boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const - { - return _mm512_blend_epi64(v, y.v, x.v) - } - - inline - realvec<double,4> boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const - { - return _mm512_blend_pd(v, y.v, x.v) - } - - - - // intvec definitions - - inline realvec<double,4> intvec<double,4>::as_float() const - { - return _mm512_castsi512_pd(v); - } - - inline realvec<double,4> intvec<double,4>::convert_float() const - { - intvec_t r(_mm512_undefined_pd()); - for (int n=0; n<size; ++n) { - r.set_elt(n, floatprops::convert_float((*this)[n])); - } - return r; } - - inline intvec<double,8> intvec<double,8>::abs() const - { - return MF::vml_abs(*this); - } - - inline intvec<double,8> intvec<double,8>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline boolvec<double,8> intvec<double,8>::isignbit() const - { - return MF::vml_isignbit(*this); - } - - inline intvec<double,8> intvec<double,8>::max(intvec_t x) const - { - return MF::vml_max(*this, x); - } - - inline intvec<double,8> intvec<double,8>::min(intvec_t x) const - { - return MF::vml_min(*this, x); - } - - inline intvec<double,8> intvec<double,8>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<double,8> intvec<double,8>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return _mm512_fmadd_pd(v, x.v, y.v); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { return _mm512_div_pd(_mm512_set1_pd(1.0), v); } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return _mm512_round_pd(v, _MM_FROUND_TO_NEAREST_INT); } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return as_int().signbit(); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return _mm512_sqrt_pd(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return _mm512_round_pd(v, _MM_FROUND_TO_ZERO); } +}; + +// boolvec definitions + +inline intvec<double, 4> boolvec<double, 4>::as_int() const { + return _mm512_castpd_si512(v); +} + +inline intvec<double, 4> boolvec<double, 4>::convert_int() const { + return ifthen(v, IV(I(1)), IV(I(0))); +} + +inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return (v & x.v) | (~v & y.v); +} + +inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x, + intvec_t y) const { + return _mm512_blend_epi64(v, y.v, x.v) +} + +inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x, + realvec_t y) const { + return _mm512_blend_pd(v, y.v, x.v) +} + +// intvec definitions + +inline realvec<double, 4> intvec<double, 4>::as_float() const { + return _mm512_castsi512_pd(v); +} + +inline realvec<double, 4> intvec<double, 4>::convert_float() const { + intvec_t r(_mm512_undefined_pd()); + for (int n = 0; n < size; ++n) { + r.set_elt(n, floatprops::convert_float((*this)[n])); + } + return r; +} + +inline intvec<double, 8> intvec<double, 8>::abs() const { + return MF::vml_abs(*this); +} + +inline intvec<double, 8> intvec<double, 8>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline boolvec<double, 8> intvec<double, 8>::isignbit() const { + return MF::vml_isignbit(*this); +} + +inline intvec<double, 8> intvec<double, 8>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<double, 8> intvec<double, 8>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<double, 8> intvec<double, 8>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 8> intvec<double, 8>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_MIC_DOUBLE8_H +#endif // #ifndef VEC_MIC_DOUBLE8_H diff --git a/vec_neon_float2.h b/vec_neon_float2.h index 3a21a05..6df9969 100644 --- a/vec_neon_float2.h +++ b/vec_neon_float2.h @@ -14,608 +14,511 @@ // Neon intrinsics #include <arm_neon.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_2 - template<> struct boolvec<float,2>; - template<> struct intvec<float,2>; - template<> struct realvec<float,2>; - - - - template<> - struct boolvec<float,2>: floatprops<float> - { - static int const size = 2; - typedef bool scalar_t; - typedef uint32x2_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vdup_n_u32(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return vmvn_u32(v); } - - boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); } - boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); } - boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); } - boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); } - - bool all() const - { - boolvec r = vpmin_u32(v, v); - return r[0]; - } - bool any() const - { - boolvec r = vpmax_u32(v, v); - return r[0]; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,2>: floatprops<float> - { - static int const size = 2; - typedef int_t scalar_t; - typedef int32x2_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vdup_n_s32(a)) {} - intvec(int_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - static intvec iota() - { - return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - // Vector casts do not change the bit battern - boolvec_t as_bool() const { return vreinterpret_u32_s32(v); } - boolvec_t convert_bool() const { return *this != IV(0); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return *this; } - intvec operator-() const { return vneg_s32(v); } - - intvec operator+(intvec x) const { return vadd_s32(v, x.v); } - intvec operator-(intvec x) const { return vsub_s32(v, x.v); } - intvec operator*(intvec x) const { return vmul_s32(v, x.v); } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - intvec& operator*=(intvec const& x) { return *this=*this*x; } - - - - intvec operator~() const { return vmvn_s32(v); } - - intvec operator&(intvec x) const { return vand_s32(v, x.v); } - intvec operator|(intvec x) const { return vorr_s32(v, x.v); } - intvec operator^(intvec x) const { return veor_s32(v, x.v); } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const - { - return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v); - } - - - - intvec_t lsr(int_t n) const { return lsr(IV(n)); } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec lsr(intvec n) const - { - return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v)); - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - return vshl_s32(v, (-n).v); - } - intvec operator<<(intvec n) const - { - return vshl_s32(v, n.v); - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const { return vclz_s32(v); } - intvec_t popcount() const - { - return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v)))); - } - - - - boolvec_t operator==(intvec const& x) const { return vceq_s32(v, x.v); } - boolvec_t operator!=(intvec const& x) const { return !(*this == x); } - boolvec_t operator<(intvec const& x) const { return vclt_s32(v, x.v); } - boolvec_t operator<=(intvec const& x) const { return vcle_s32(v, x.v); } - boolvec_t operator>(intvec const& x) const { return vcgt_s32(v, x.v); } - boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); } - - intvec_t abs() const { return vabs_s32(v); } - boolvec_t isignbit() const - { - //return *this < IV(I(0)); - return intvec(vshr_n_s32(v, FP::bits-1)).as_bool(); - } - intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); } - intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); } - }; - - - - template<> - struct realvec<float,2>: floatprops<float> - { - static int const size = 2; - typedef real_t scalar_t; - typedef float32x2_t vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<NEON:2*float>"; } - void barrier() { __asm__("": "+w"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vdup_n_f32(a)) {} - realvec(real_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vld1_f32(p); - } - static realvec_t loadu(real_t const* p) - { +template <> struct boolvec<float, 2>; +template <> struct intvec<float, 2>; +template <> struct realvec<float, 2>; + +template <> struct boolvec<float, 2> : floatprops<float> { + static int const size = 2; + typedef bool scalar_t; + typedef uint32x2_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values are -1, false values are 0 + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(vdup_n_u32(from_bool(a))) {} + boolvec(bool const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return vmvn_u32(v); } + + boolvec operator&&(boolvec x) const { return vand_u32(v, x.v); } + boolvec operator||(boolvec x) const { return vorr_u32(v, x.v); } + boolvec operator==(boolvec x) const { return vceq_u32(v, x.v); } + boolvec operator!=(boolvec x) const { return veor_u32(v, x.v); } + + bool all() const { + boolvec r = vpmin_u32(v, v); + return r[0]; + } + bool any() const { + boolvec r = vpmax_u32(v, v); + return r[0]; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 2> : floatprops<float> { + static int const size = 2; + typedef int_t scalar_t; + typedef int32x2_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(vdup_n_s32(a)) {} + intvec(int_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + static intvec iota() { + return vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + // Vector casts do not change the bit battern + boolvec_t as_bool() const { return vreinterpret_u32_s32(v); } + boolvec_t convert_bool() const { return *this != IV(0); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec operator+() const { return *this; } + intvec operator-() const { return vneg_s32(v); } + + intvec operator+(intvec x) const { return vadd_s32(v, x.v); } + intvec operator-(intvec x) const { return vsub_s32(v, x.v); } + intvec operator*(intvec x) const { return vmul_s32(v, x.v); } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + intvec &operator*=(intvec const &x) { return *this = *this * x; } + + intvec operator~() const { return vmvn_s32(v); } + + intvec operator&(intvec x) const { return vand_s32(v, x.v); } + intvec operator|(intvec x) const { return vorr_s32(v, x.v); } + intvec operator^(intvec x) const { return veor_s32(v, x.v); } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const { + return vbsl_s32(vreinterpret_u32_s32(v), x.v, y.v); + } + + intvec_t lsr(int_t n) const { return lsr(IV(n)); } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec lsr(intvec n) const { + return vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v), (-n).v)); + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { return vshl_s32(v, (-n).v); } + intvec operator<<(intvec n) const { return vshl_s32(v, n.v); } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const { return vclz_s32(v); } + intvec_t popcount() const { + return vpaddl_s16(vpaddl_s8(vcnt_s8(vreinterpret_s8_s32(v)))); + } + + boolvec_t operator==(intvec const &x) const { return vceq_s32(v, x.v); } + boolvec_t operator!=(intvec const &x) const { return !(*this == x); } + boolvec_t operator<(intvec const &x) const { return vclt_s32(v, x.v); } + boolvec_t operator<=(intvec const &x) const { return vcle_s32(v, x.v); } + boolvec_t operator>(intvec const &x) const { return vcgt_s32(v, x.v); } + boolvec_t operator>=(intvec const &x) const { return vcge_s32(v, x.v); } + + intvec_t abs() const { return vabs_s32(v); } + boolvec_t isignbit() const { + // return *this < IV(I(0)); + return intvec(vshr_n_s32(v, FP::bits - 1)).as_bool(); + } + intvec_t max(intvec_t x) const { return vmax_s32(v, x.v); } + intvec_t min(intvec_t x) const { return vmin_s32(v, x.v); } +}; + +template <> struct realvec<float, 2> : floatprops<float> { + static int const size = 2; + typedef real_t scalar_t; + typedef float32x2_t vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<NEON:2*float>"; } + void barrier() { __asm__("" : "+w"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(vdup_n_f32(a)) {} + realvec(real_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vld1_f32(p); + } + static realvec_t loadu(real_t const *p) { #if defined __ARM_FEATURE_UNALIGNED - return vld1_f32(p); + return vld1_f32(p); #else - realvec_t r; - r.set_elt(0, p[0]); - r.set_elt(1, p[1]); - return r; + realvec_t r; + r.set_elt(0, p[0]); + r.set_elt(1, p[1]); + return r; #endif + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vst1_f32(p, v); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + vst1_f32(p, v); + } + void storeu(real_t *p) const { +// Vector stores would require vector loads, which would need to +// be atomic #if defined __ARM_FEATURE_UNALIGNED - vst1_f32(p, v); + vst1_f32(p, v); #else - p[0] = (*this)[0]; - p[1] = (*this)[1]; + p[0] = (*this)[0]; + p[1] = (*this)[1]; #endif + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return vreinterpret_s32_f32(v); } - intvec_t convert_int() const { return vcvt_s32_f32(v); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return vneg_f32(v); } - - realvec operator+(realvec x) const { return vadd_f32(v, x.v); } - realvec operator-(realvec x) const { return vsub_f32(v, x.v); } - realvec operator*(realvec x) const { return vmul_f32(v, x.v); } - realvec operator/(realvec x) const { return *this * x.rcp(); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const - { - realvec r = vpmax_f32(v, v); - return r[0]; - } - real_t minval() const - { - realvec r = vpmin_f32(v, v); - return r[0]; - } - real_t prod() const - { - return (*this)[0] * (*this)[1]; - } - real_t sum() const - { - realvec r = vpadd_f32(v, v); - return r[0]; - } - - - - boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return !(*this == x); } - boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { - // return vrndp_f32(v); - return MF::vml_ceil(*this); - } - realvec copysign(realvec y) const - { - return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v); - } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vabs_f32(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { - // return vrndm_f32(v); - return MF::vml_floor(*this); - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return vfma_f32(z.v, v, y.v); - } - realvec fmax(realvec y) const { return vmax_f32(v, y.v); } - realvec fmin(realvec y) const { return vmin_f32(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - // TODO: vfma_f32 - return vmla_f32(z.v, v, y.v); - } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec r = vrecpe_f32(v); - r *= vrecps_f32(v, r); - r *= vrecps_f32(v, r); - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - // return vrndn_f32(v); - return MF::vml_rint(*this); - } - realvec round() const - { - // return vrnda_f32(v); - return MF::vml_round(*this); - } - realvec rsqrt() const - { - realvec r = vrsqrte_f32(v); - r *= vrsqrts_f32(v, r*r); - r *= vrsqrts_f32(v, r*r); - return r; - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return *this * rsqrt(); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { - // return vrnd_f32(v); - return MF::vml_trunc(*this); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; } - }; - - - - // boolvec definitions - - inline intvec<float,2> boolvec<float,2>::as_int() const - { - return vreinterpret_s32_u32(v); - } - - inline intvec<float,2> boolvec<float,2>::convert_int() const - { - return - as_int(); - } - - inline - boolvec<float,2> boolvec<float,2>::ifthen(boolvec_t x, boolvec_t y) const - { - return vbsl_u32(v, x.v, y.v); - } - - inline intvec<float,2> boolvec<float,2>::ifthen(intvec_t x, intvec_t y) const - { - return vbsl_s32(v, x.v, y.v); - } - - inline - realvec<float,2> boolvec<float,2>::ifthen(realvec_t x, realvec_t y) const - { - return vbsl_f32(v, x.v, y.v); - } - - - - // intvec definitions - - inline realvec<float,2> intvec<float,2>::as_float() const - { - return vreinterpret_f32_s32(v); - } - - inline realvec<float,2> intvec<float,2>::convert_float() const - { - return vcvt_f32_s32(v); - } - - inline intvec<float,2> intvec<float,2>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,2> intvec<float,2>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return vreinterpret_s32_f32(v); } + intvec_t convert_int() const { return vcvt_s32_f32(v); } + + realvec operator+() const { return *this; } + realvec operator-() const { return vneg_f32(v); } + + realvec operator+(realvec x) const { return vadd_f32(v, x.v); } + realvec operator-(realvec x) const { return vsub_f32(v, x.v); } + realvec operator*(realvec x) const { return vmul_f32(v, x.v); } + realvec operator/(realvec x) const { return *this * x.rcp(); } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { + realvec r = vpmax_f32(v, v); + return r[0]; + } + real_t minval() const { + realvec r = vpmin_f32(v, v); + return r[0]; + } + real_t prod() const { return (*this)[0] * (*this)[1]; } + real_t sum() const { + realvec r = vpadd_f32(v, v); + return r[0]; + } + + boolvec_t operator==(realvec const &x) const { return vceq_f32(v, x.v); } + boolvec_t operator!=(realvec const &x) const { return !(*this == x); } + boolvec_t operator<(realvec const &x) const { return vclt_f32(v, x.v); } + boolvec_t operator<=(realvec const &x) const { return vcle_f32(v, x.v); } + boolvec_t operator>(realvec const &x) const { return vcgt_f32(v, x.v); } + boolvec_t operator>=(realvec const &x) const { return vcge_f32(v, x.v); } + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { + // return vrndp_f32(v); + return MF::vml_ceil(*this); + } + realvec copysign(realvec y) const { + return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v); + } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vabs_f32(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { + // return vrndm_f32(v); + return MF::vml_floor(*this); + } + realvec_t fma(realvec_t y, realvec_t z) const { + return vfma_f32(z.v, v, y.v); + } + realvec fmax(realvec y) const { return vmax_f32(v, y.v); } + realvec fmin(realvec y) const { return vmin_f32(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + // TODO: vfma_f32 + return vmla_f32(z.v, v, y.v); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { + realvec r = vrecpe_f32(v); + r *= vrecps_f32(v, r); + r *= vrecps_f32(v, r); + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { + // return vrndn_f32(v); + return MF::vml_rint(*this); + } + realvec round() const { + // return vrnda_f32(v); + return MF::vml_round(*this); + } + realvec rsqrt() const { + realvec r = vrsqrte_f32(v); + r *= vrsqrts_f32(v, r * r); + r *= vrsqrts_f32(v, r * r); + return r; + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return *this * rsqrt(); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { + // return vrnd_f32(v); + return MF::vml_trunc(*this); + } +}; + +// boolvec definitions + +inline intvec<float, 2> boolvec<float, 2>::as_int() const { + return vreinterpret_s32_u32(v); +} + +inline intvec<float, 2> boolvec<float, 2>::convert_int() const { + return -as_int(); +} + +inline boolvec<float, 2> boolvec<float, 2>::ifthen(boolvec_t x, + boolvec_t y) const { + return vbsl_u32(v, x.v, y.v); +} + +inline intvec<float, 2> boolvec<float, 2>::ifthen(intvec_t x, + intvec_t y) const { + return vbsl_s32(v, x.v, y.v); +} + +inline realvec<float, 2> boolvec<float, 2>::ifthen(realvec_t x, + realvec_t y) const { + return vbsl_f32(v, x.v, y.v); +} + +// intvec definitions + +inline realvec<float, 2> intvec<float, 2>::as_float() const { + return vreinterpret_f32_s32(v); +} + +inline realvec<float, 2> intvec<float, 2>::convert_float() const { + return vcvt_f32_s32(v); +} + +inline intvec<float, 2> intvec<float, 2>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 2> intvec<float, 2>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_NEON_FLOAT2_H +#endif // #ifndef VEC_NEON_FLOAT2_H diff --git a/vec_neon_float4.h b/vec_neon_float4.h index 2bd9dda..9ec1e79 100644 --- a/vec_neon_float4.h +++ b/vec_neon_float4.h @@ -14,628 +14,537 @@ // Neon intrinsics #include <arm_neon.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_4 - template<> struct boolvec<float,4>; - template<> struct intvec<float,4>; - template<> struct realvec<float,4>; - - - - template<> - struct boolvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef bool scalar_t; - typedef uint32x4_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vdupq_n_u32(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return vmvnq_u32(v); } - - boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); } - boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); } - boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); } - boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); } - - bool all() const - { - uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v)); - uint32x2_t y = vpmin_u32(x, x); - uint32_t z = vget_lane_u32(y, 0); - return to_bool(z); - } - bool any() const - { - uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v)); - uint32x2_t y = vpmax_u32(x, x); - uint32_t z = vget_lane_u32(y, 0); - return to_bool(z); - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef int_t scalar_t; - typedef int32x4_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vdupq_n_s32(a)) {} - intvec(int_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - static intvec iota() - { - return - vcombine_s32(vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)), - vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2))); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - // Vector casts do not change the bit battern - boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); } - boolvec_t convert_bool() const { return *this != IV(0); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return *this; } - intvec operator-() const { return vnegq_s32(v); } - - intvec operator+(intvec x) const { return vaddq_s32(v, x.v); } - intvec operator-(intvec x) const { return vsubq_s32(v, x.v); } - intvec operator*(intvec x) const { return vmulq_s32(v, x.v); } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - intvec& operator*=(intvec const& x) { return *this=*this*x; } - - - - intvec operator~() const { return vmvnq_s32(v); } - - intvec operator&(intvec x) const { return vandq_s32(v, x.v); } - intvec operator|(intvec x) const { return vorrq_s32(v, x.v); } - intvec operator^(intvec x) const { return veorq_s32(v, x.v); } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const - { - return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v); - } - - - - intvec_t lsr(int_t n) const { return lsr(IV(n)); } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v)); - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - return vshlq_s32(v, (-n).v); - } - intvec operator<<(intvec n) const - { - return vshlq_s32(v, n.v); - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const { return vclzq_s32(v); } - intvec_t popcount() const - { - return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v)))); - } - - - - boolvec_t operator==(intvec const& x) const { return vceqq_s32(v, x.v); } - boolvec_t operator!=(intvec const& x) const { return !(*this == x); } - boolvec_t operator<(intvec const& x) const { return vcltq_s32(v, x.v); } - boolvec_t operator<=(intvec const& x) const { return vcleq_s32(v, x.v); } - boolvec_t operator>(intvec const& x) const { return vcgtq_s32(v, x.v); } - boolvec_t operator>=(intvec const& x) const { return vcgeq_s32(v, x.v); } - - intvec_t abs() const { return vabsq_s32(v); } - boolvec_t isignbit() const - { - //return *this < IV(I(0)); - return intvec(vshrq_n_s32(v, FP::bits-1)).as_bool(); - } - intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); } - intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); } - }; - - - - template<> - struct realvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef real_t scalar_t; - typedef float32x4_t vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<NEON:4*float>"; } - void barrier() { __asm__("": "+w"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vdupq_n_f32(a)) {} - realvec(real_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vld1q_f32(p); - } - static realvec_t loadu(real_t const* p) - { +template <> struct boolvec<float, 4>; +template <> struct intvec<float, 4>; +template <> struct realvec<float, 4>; + +template <> struct boolvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef bool scalar_t; + typedef uint32x4_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values are -1, false values are 0 + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(vdupq_n_u32(from_bool(a))) {} + boolvec(bool const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return vmvnq_u32(v); } + + boolvec operator&&(boolvec x) const { return vandq_u32(v, x.v); } + boolvec operator||(boolvec x) const { return vorrq_u32(v, x.v); } + boolvec operator==(boolvec x) const { return vceqq_u32(v, x.v); } + boolvec operator!=(boolvec x) const { return veorq_u32(v, x.v); } + + bool all() const { + uint32x2_t x = vpmin_u32(vget_low_u32(v), vget_high_u32(v)); + uint32x2_t y = vpmin_u32(x, x); + uint32_t z = vget_lane_u32(y, 0); + return to_bool(z); + } + bool any() const { + uint32x2_t x = vpmax_u32(vget_low_u32(v), vget_high_u32(v)); + uint32x2_t y = vpmax_u32(x, x); + uint32_t z = vget_lane_u32(y, 0); + return to_bool(z); + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef int_t scalar_t; + typedef int32x4_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(vdupq_n_s32(a)) {} + intvec(int_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + static intvec iota() { + return vcombine_s32( + vcreate_s32((uint64_t(1) << uint64_t(32)) | uint64_t(0)), + vcreate_s32((uint64_t(3) << uint64_t(32)) | uint64_t(2))); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + // Vector casts do not change the bit battern + boolvec_t as_bool() const { return vreinterpretq_u32_s32(v); } + boolvec_t convert_bool() const { return *this != IV(0); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec operator+() const { return *this; } + intvec operator-() const { return vnegq_s32(v); } + + intvec operator+(intvec x) const { return vaddq_s32(v, x.v); } + intvec operator-(intvec x) const { return vsubq_s32(v, x.v); } + intvec operator*(intvec x) const { return vmulq_s32(v, x.v); } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + intvec &operator*=(intvec const &x) { return *this = *this * x; } + + intvec operator~() const { return vmvnq_s32(v); } + + intvec operator&(intvec x) const { return vandq_s32(v, x.v); } + intvec operator|(intvec x) const { return vorrq_s32(v, x.v); } + intvec operator^(intvec x) const { return veorq_s32(v, x.v); } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const { + return vbslq_s32(vreinterpretq_u32_s32(v), x.v, y.v); + } + + intvec_t lsr(int_t n) const { return lsr(IV(n)); } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + return vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v), (-n).v)); + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { return vshlq_s32(v, (-n).v); } + intvec operator<<(intvec n) const { return vshlq_s32(v, n.v); } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const { return vclzq_s32(v); } + intvec_t popcount() const { + return vpaddlq_s16(vpaddlq_s8(vcntq_s8(vreinterpretq_s8_s32(v)))); + } + + boolvec_t operator==(intvec const &x) const { return vceqq_s32(v, x.v); } + boolvec_t operator!=(intvec const &x) const { return !(*this == x); } + boolvec_t operator<(intvec const &x) const { return vcltq_s32(v, x.v); } + boolvec_t operator<=(intvec const &x) const { return vcleq_s32(v, x.v); } + boolvec_t operator>(intvec const &x) const { return vcgtq_s32(v, x.v); } + boolvec_t operator>=(intvec const &x) const { return vcgeq_s32(v, x.v); } + + intvec_t abs() const { return vabsq_s32(v); } + boolvec_t isignbit() const { + // return *this < IV(I(0)); + return intvec(vshrq_n_s32(v, FP::bits - 1)).as_bool(); + } + intvec_t max(intvec_t x) const { return vmaxq_s32(v, x.v); } + intvec_t min(intvec_t x) const { return vminq_s32(v, x.v); } +}; + +template <> struct realvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef real_t scalar_t; + typedef float32x4_t vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<NEON:4*float>"; } + void barrier() { __asm__("" : "+w"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(vdupq_n_f32(a)) {} + realvec(real_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vld1q_f32(p); + } + static realvec_t loadu(real_t const *p) { #if defined __ARM_FEATURE_UNALIGNED - return vld1q_f32(p); + return vld1q_f32(p); #else - realvec_t r; - r.set_elt(0, p[0]); - r.set_elt(1, p[1]); - r.set_elt(2, p[2]); - r.set_elt(3, p[3]); - return r; + realvec_t r; + r.set_elt(0, p[0]); + r.set_elt(1, p[1]); + r.set_elt(2, p[2]); + r.set_elt(3, p[3]); + return r; #endif + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vst1q_f32(p, v); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + vst1q_f32(p, v); + } + void storeu(real_t *p) const { +// Vector stores would require vector loads, which would need to +// be atomic #if defined __ARM_FEATURE_UNALIGNED - vst1q_f32(p, v); + vst1q_f32(p, v); #else - p[0] = (*this)[0]; - p[1] = (*this)[1]; - p[2] = (*this)[2]; - p[3] = (*this)[3]; + p[0] = (*this)[0]; + p[1] = (*this)[1]; + p[2] = (*this)[2]; + p[3] = (*this)[3]; #endif + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return vreinterpretq_s32_f32(v); } - intvec_t convert_int() const { return vcvtq_s32_f32(v); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return vnegq_f32(v); } - - realvec operator+(realvec x) const { return vaddq_f32(v, x.v); } - realvec operator-(realvec x) const { return vsubq_f32(v, x.v); } - realvec operator*(realvec x) const { return vmulq_f32(v, x.v); } - realvec operator/(realvec x) const { return *this * x.rcp(); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const - { - float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v)); - float32x2_t y = vpmax_f32(x, x); - float32_t z = vget_lane_f32(y, 0); - return z; - } - real_t minval() const - { - float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v)); - float32x2_t y = vpmin_f32(x, x); - float32_t z = vget_lane_f32(y, 0); - return z; - } - real_t prod() const - { - // TODO: multiply pairwise with 2-vectors - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); - float32x2_t y = vpadd_f32(x, x); - float32_t z = vget_lane_f32(y, 0); - return z; - } - - - - boolvec_t operator==(realvec const& x) const { return vceqq_f32(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return !(*this == x); } - boolvec_t operator<(realvec const& x) const { return vcltq_f32(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vcleq_f32(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vcgtq_f32(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vcgeq_f32(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { - // return vrndpq_f32(v); - return MF::vml_ceil(*this); - } - realvec copysign(realvec y) const - { - return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v); - } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vabsq_f32(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { - // return vrndmq_f32(v); - return MF::vml_floor(*this); - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return vfmaq_f32(z.v, v, y.v); - } - realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); } - realvec fmin(realvec y) const { return vminq_f32(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return vmlaq_f32(z.v, v, y.v); - } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec r = vrecpeq_f32(v); - r *= vrecpsq_f32(v, r); - r *= vrecpsq_f32(v, r); - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - // return vrndnq_f32(v); - return MF::vml_rint(*this); - } - realvec round() const - { - // return vrndaq_f32(v); - return MF::vml_round(*this); - } - realvec rsqrt() const - { - realvec r = vrsqrteq_f32(v); - r *= vrsqrtsq_f32(v, r*r); - r *= vrsqrtsq_f32(v, r*r); - return r; - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return *this * rsqrt(); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { - // return vrndq_f32(v); - return MF::vml_trunc(*this); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - }; - - - - // boolvec definitions - - inline intvec<float,4> boolvec<float,4>::as_int() const - { - return vreinterpretq_s32_u32(v); - } - - inline intvec<float,4> boolvec<float,4>::convert_int() const - { - return - as_int(); - } - - inline - boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return vbslq_u32(v, x.v, y.v); - } - - inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const - { - return vbslq_s32(v, x.v, y.v); - } - - inline - realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const - { - return vbslq_f32(v, x.v, y.v); - } - - - - // intvec definitions - - inline realvec<float,4> intvec<float,4>::as_float() const - { - return vreinterpretq_f32_s32(v); - } - - inline realvec<float,4> intvec<float,4>::convert_float() const - { - return vcvtq_f32_s32(v); - } - - inline intvec<float,4> intvec<float,4>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return vreinterpretq_s32_f32(v); } + intvec_t convert_int() const { return vcvtq_s32_f32(v); } + + realvec operator+() const { return *this; } + realvec operator-() const { return vnegq_f32(v); } + + realvec operator+(realvec x) const { return vaddq_f32(v, x.v); } + realvec operator-(realvec x) const { return vsubq_f32(v, x.v); } + realvec operator*(realvec x) const { return vmulq_f32(v, x.v); } + realvec operator/(realvec x) const { return *this * x.rcp(); } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { + float32x2_t x = vpmax_f32(vget_low_f32(v), vget_high_f32(v)); + float32x2_t y = vpmax_f32(x, x); + float32_t z = vget_lane_f32(y, 0); + return z; + } + real_t minval() const { + float32x2_t x = vpmin_f32(vget_low_f32(v), vget_high_f32(v)); + float32x2_t y = vpmin_f32(x, x); + float32_t z = vget_lane_f32(y, 0); + return z; + } + real_t prod() const { + // TODO: multiply pairwise with 2-vectors + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const { + float32x2_t x = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); + float32x2_t y = vpadd_f32(x, x); + float32_t z = vget_lane_f32(y, 0); + return z; + } + + boolvec_t operator==(realvec const &x) const { return vceqq_f32(v, x.v); } + boolvec_t operator!=(realvec const &x) const { return !(*this == x); } + boolvec_t operator<(realvec const &x) const { return vcltq_f32(v, x.v); } + boolvec_t operator<=(realvec const &x) const { return vcleq_f32(v, x.v); } + boolvec_t operator>(realvec const &x) const { return vcgtq_f32(v, x.v); } + boolvec_t operator>=(realvec const &x) const { return vcgeq_f32(v, x.v); } + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { + // return vrndpq_f32(v); + return MF::vml_ceil(*this); + } + realvec copysign(realvec y) const { + return vbslq_f32(vdupq_n_u32(FP::signbit_mask), y.v, v); + } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vabsq_f32(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { + // return vrndmq_f32(v); + return MF::vml_floor(*this); + } + realvec_t fma(realvec_t y, realvec_t z) const { + return vfmaq_f32(z.v, v, y.v); + } + realvec fmax(realvec y) const { return vmaxq_f32(v, y.v); } + realvec fmin(realvec y) const { return vminq_f32(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return vmlaq_f32(z.v, v, y.v); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { + realvec r = vrecpeq_f32(v); + r *= vrecpsq_f32(v, r); + r *= vrecpsq_f32(v, r); + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { + // return vrndnq_f32(v); + return MF::vml_rint(*this); + } + realvec round() const { + // return vrndaq_f32(v); + return MF::vml_round(*this); + } + realvec rsqrt() const { + realvec r = vrsqrteq_f32(v); + r *= vrsqrtsq_f32(v, r * r); + r *= vrsqrtsq_f32(v, r * r); + return r; + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return *this * rsqrt(); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { + // return vrndq_f32(v); + return MF::vml_trunc(*this); + } +}; + +// boolvec definitions + +inline intvec<float, 4> boolvec<float, 4>::as_int() const { + return vreinterpretq_s32_u32(v); +} + +inline intvec<float, 4> boolvec<float, 4>::convert_int() const { + return -as_int(); +} + +inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return vbslq_u32(v, x.v, y.v); +} + +inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x, + intvec_t y) const { + return vbslq_s32(v, x.v, y.v); +} + +inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x, + realvec_t y) const { + return vbslq_f32(v, x.v, y.v); +} + +// intvec definitions + +inline realvec<float, 4> intvec<float, 4>::as_float() const { + return vreinterpretq_f32_s32(v); +} + +inline realvec<float, 4> intvec<float, 4>::convert_float() const { + return vcvtq_f32_s32(v); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_NEON_FLOAT4_H +#endif // #ifndef VEC_NEON_FLOAT4_H diff --git a/vec_pseudo.h b/vec_pseudo.h index 2aafc23..c4cbbc1 100644 --- a/vec_pseudo.h +++ b/vec_pseudo.h @@ -12,1668 +12,1492 @@ #include <climits> #include <cstdlib> #ifndef VML_NO_IOSTREAM -# include <sstream> +#include <sstream> #endif #include <string> +namespace vecmathlib { +template <typename T, int N> struct boolpseudovec; +template <typename T, int N> struct intpseudovec; +template <typename T, int N> struct realpseudovec; -namespace vecmathlib { - - template<typename T, int N> struct boolpseudovec; - template<typename T, int N> struct intpseudovec; - template<typename T, int N> struct realpseudovec; - - - - template<typename T, int N> - struct boolpseudovec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef bool scalar_t; - typedef bool bvector_t[size]; - static int const alignment = sizeof(bool); - - typedef boolpseudovec boolvec_t; - typedef intpseudovec<real_t, size> intvec_t; - typedef realpseudovec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolpseudovec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolpseudovec(boolpseudovec const& x): v(x.v) {} - // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; } - boolpseudovec(bool a) { for (int d=0; d<size; ++d) v[d]=a; } - boolpseudovec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - - bool operator[](int n) const { return v[n]; } - boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; } - - - - intvec_t as_int() const; // defined after intpseudovec - intvec_t convert_int() const; // defined after intpseudovec - - - - boolvec_t operator!() const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = !v[d]; - return res; - } - - boolvec_t operator&&(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d]; - return res; - } - boolvec_t operator||(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d]; - return res; - } - boolvec_t operator==(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - - bool all() const - { - bool res = v[0]; - for (int d=1; d<size; ++d) res = res && v[d]; - return res; - } - bool any() const - { - bool res = v[0]; - for (int d=1; d<size; ++d) res = res || v[d]; - return res; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realpseudovec - }; - - - - template<typename T, int N> - struct intpseudovec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef int_t scalar_t; - typedef int_t ivector_t[size]; - static int const alignment = sizeof(int_t); - - typedef boolpseudovec<real_t, size> boolvec_t; - typedef intpseudovec intvec_t; - typedef realpseudovec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intpseudovec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intpseudovec(intpseudovec const& x): v(x.v) {} - // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; } - intpseudovec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; } - intpseudovec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - static intvec_t iota() - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d]=d; - return res; - } - - int_t operator[](int n) const { return v[n]; } - intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; } - - - - boolvec_t as_bool() const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d]=v[d]; - return res; - } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d]; - return res; - } - realvec_t as_float() const; // defined after realpseudovec - realvec_t convert_float() const; // defined after realpseudovec - - - - intvec_t operator+() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = + v[d]; - return res; - } - intvec_t operator-() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = - v[d]; - return res; - } - - intvec_t& operator+=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] += x.v[d]; - return *this; - } - intvec_t& operator-=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] -= x.v[d]; - return *this; - } - intvec_t& operator*=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] *= x.v[d]; - return *this; - } - intvec_t& operator/=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] /= x.v[d]; - return *this; - } - intvec_t& operator%=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] %= x.v[d]; - return *this; - } - - intvec_t operator+(intvec_t x) const - { - intvec_t res = *this; - return res += x; - } - intvec_t operator-(intvec_t x) const - { - intvec_t res = *this; - return res -= x; - } - intvec_t operator*(intvec_t x) const - { - intvec_t res = *this; - return res *= x; - } - intvec_t operator/(intvec_t x) const - { - intvec_t res = *this; - return res /= x; - } - intvec_t operator%(intvec_t x) const - { - intvec_t res = *this; - return res %= x; - } - - - - intvec_t operator~() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = ~ v[d]; - return res; - } - - intvec_t& operator&=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] &= x.v[d]; - return *this; - } - intvec_t& operator|=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] |= x.v[d]; - return *this; - } - intvec_t& operator^=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] ^= x.v[d]; - return *this; - } - - intvec_t operator&(intvec_t x) const - { - intvec_t res = *this; - return res &= x; - } - intvec_t operator|(intvec_t x) const - { - intvec_t res = *this; - return res |= x; - } - intvec_t operator^(intvec_t x) const - { - intvec_t res = *this; - return res ^= x; - } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n)); - return res; - } - intvec_t rotate(int_t n) const; - intvec_t& operator>>=(int_t n) - { - for (int d=0; d<size; ++d) v[d] >>= n; - return *this; - } - intvec_t& operator<<=(int_t n) - { - for (int d=0; d<size; ++d) v[d] <<= n; - return *this; - } - intvec_t operator>>(int_t n) const - { - intvec_t res = *this; - return res >>= n; - } - intvec_t operator<<(int_t n) const - { - intvec_t res = *this; - return res <<= n; - } - - intvec_t lsr(intvec_t n) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d])); - return res; - } - intvec_t rotate(intvec_t n) const; - intvec_t& operator>>=(intvec_t n) - { - for (int d=0; d<size; ++d) v[d] >>= n.v[d]; - return *this; - } - intvec_t& operator<<=(intvec_t n) - { - for (int d=0; d<size; ++d) v[d] <<= n.v[d]; - return *this; - } - intvec_t operator>>(intvec_t n) const - { - intvec_t res = *this; - return res >>= n; - } - intvec_t operator<<(intvec_t n) const - { - intvec_t res = *this; - return res <<= n; - } - - intvec_t clz() const - { - intvec_t res; +template <typename T, int N> struct boolpseudovec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef bool scalar_t; + typedef bool bvector_t[size]; + static int const alignment = sizeof(bool); + + typedef boolpseudovec boolvec_t; + typedef intpseudovec<real_t, size> intvec_t; + typedef realpseudovec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolpseudovec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolpseudovec(boolpseudovec const& x): v(x.v) {} + // boolpseudovec& operator=(boolpseudovec const& x) { return v=x.v, *this; } + boolpseudovec(bool a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + boolpseudovec(bool const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + + bool operator[](int n) const { return v[n]; } + boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; } + + intvec_t as_int() const; // defined after intpseudovec + intvec_t convert_int() const; // defined after intpseudovec + + boolvec_t operator!() const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = !v[d]; + return res; + } + + boolvec_t operator&&(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] && x.v[d]; + return res; + } + boolvec_t operator||(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] || x.v[d]; + return res; + } + boolvec_t operator==(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + + bool all() const { + bool res = v[0]; + for (int d = 1; d < size; ++d) + res = res && v[d]; + return res; + } + bool any() const { + bool res = v[0]; + for (int d = 1; d < size; ++d) + res = res || v[d]; + return res; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intpseudovec + realvec_t ifthen(realvec_t x, + realvec_t y) const; // defined after realpseudovec +}; + +template <typename T, int N> struct intpseudovec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef int_t scalar_t; + typedef int_t ivector_t[size]; + static int const alignment = sizeof(int_t); + + typedef boolpseudovec<real_t, size> boolvec_t; + typedef intpseudovec intvec_t; + typedef realpseudovec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intpseudovec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intpseudovec(intpseudovec const& x): v(x.v) {} + // intpseudovec& operator=(intpseudovec const& x) { return v=x.v, *this; } + intpseudovec(int_t a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + intpseudovec(int_t const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + static intvec_t iota() { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = d; + return res; + } + + int_t operator[](int n) const { return v[n]; } + intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; } + + boolvec_t as_bool() const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d]; + return res; + } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d]; + return res; + } + realvec_t as_float() const; // defined after realpseudovec + realvec_t convert_float() const; // defined after realpseudovec + + intvec_t operator+() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = +v[d]; + return res; + } + intvec_t operator-() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = -v[d]; + return res; + } + + intvec_t &operator+=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] += x.v[d]; + return *this; + } + intvec_t &operator-=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] -= x.v[d]; + return *this; + } + intvec_t &operator*=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] *= x.v[d]; + return *this; + } + intvec_t &operator/=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] /= x.v[d]; + return *this; + } + intvec_t &operator%=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] %= x.v[d]; + return *this; + } + + intvec_t operator+(intvec_t x) const { + intvec_t res = *this; + return res += x; + } + intvec_t operator-(intvec_t x) const { + intvec_t res = *this; + return res -= x; + } + intvec_t operator*(intvec_t x) const { + intvec_t res = *this; + return res *= x; + } + intvec_t operator/(intvec_t x) const { + intvec_t res = *this; + return res /= x; + } + intvec_t operator%(intvec_t x) const { + intvec_t res = *this; + return res %= x; + } + + intvec_t operator~() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = ~v[d]; + return res; + } + + intvec_t &operator&=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] &= x.v[d]; + return *this; + } + intvec_t &operator|=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] |= x.v[d]; + return *this; + } + intvec_t &operator^=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] ^= x.v[d]; + return *this; + } + + intvec_t operator&(intvec_t x) const { + intvec_t res = *this; + return res &= x; + } + intvec_t operator|(intvec_t x) const { + intvec_t res = *this; + return res |= x; + } + intvec_t operator^(intvec_t x) const { + intvec_t res = *this; + return res ^= x; + } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = I(U(v[d]) >> U(n)); + return res; + } + intvec_t rotate(int_t n) const; + intvec_t &operator>>=(int_t n) { + for (int d = 0; d < size; ++d) + v[d] >>= n; + return *this; + } + intvec_t &operator<<=(int_t n) { + for (int d = 0; d < size; ++d) + v[d] <<= n; + return *this; + } + intvec_t operator>>(int_t n) const { + intvec_t res = *this; + return res >>= n; + } + intvec_t operator<<(int_t n) const { + intvec_t res = *this; + return res <<= n; + } + + intvec_t lsr(intvec_t n) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = I(U(v[d]) >> U(n.v[d])); + return res; + } + intvec_t rotate(intvec_t n) const; + intvec_t &operator>>=(intvec_t n) { + for (int d = 0; d < size; ++d) + v[d] >>= n.v[d]; + return *this; + } + intvec_t &operator<<=(intvec_t n) { + for (int d = 0; d < size; ++d) + v[d] <<= n.v[d]; + return *this; + } + intvec_t operator>>(intvec_t n) const { + intvec_t res = *this; + return res >>= n; + } + intvec_t operator<<(intvec_t n) const { + intvec_t res = *this; + return res <<= n; + } + + intvec_t clz() const { + intvec_t res; #if defined __clang__ || defined __gcc__ - for (int d=0; d<size; ++d) { - if (v[d] == 0) { - res.v[d] = CHAR_BIT * sizeof v[d]; + for (int d = 0; d < size; ++d) { + if (v[d] == 0) { + res.v[d] = CHAR_BIT * sizeof v[d]; + } else { + if (sizeof v[d] == sizeof(long long)) { + res.v[d] = __builtin_clzll(v[d]); + } else if (sizeof v[d] == sizeof(long)) { + res.v[d] = __builtin_clzl(v[d]); + } else if (sizeof v[d] == sizeof(int)) { + res.v[d] = __builtin_clz(v[d]); + } else if (sizeof v[d] == sizeof(short)) { + res.v[d] = __builtin_clzs(v[d]); + } else if (sizeof v[d] == sizeof(char)) { + res.v[d] = __builtin_clzs((unsigned short)(unsigned char)v[d]) - + CHAR_BIT * (sizeof(short) - sizeof(char)); } else { - if (sizeof v[d] == sizeof(long long)) { - res.v[d] = __builtin_clzll(v[d]); - } else if (sizeof v[d] == sizeof(long)) { - res.v[d] = __builtin_clzl(v[d]); - } else if (sizeof v[d] == sizeof(int)) { - res.v[d] = __builtin_clz(v[d]); - } else if (sizeof v[d] == sizeof(short)) { - res.v[d] = __builtin_clzs(v[d]); - } else if (sizeof v[d] == sizeof(char)) { - res.v[d] = - __builtin_clzs((unsigned short)(unsigned char)v[d]) - - CHAR_BIT * (sizeof(short) - sizeof(char)); - } else { - __builtin_unreachable(); - } + __builtin_unreachable(); } } + } #else - res = MF::vml_clz(*this); + res = MF::vml_clz(*this); #endif - return res; - } - intvec_t popcount() const - { - intvec_t res; + return res; + } + intvec_t popcount() const { + intvec_t res; #if defined __clang__ || defined __gcc__ - if (sizeof(int_t) == sizeof(long long)) { - for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountll(v[d]); - } else if (sizeof(int_t) == sizeof(long)) { - for (int d=0; d<size; ++d) res.v[d] = __builtin_popcountl(v[d]); - } else if (sizeof(int_t) <= sizeof(int)) { - for (int d=0; d<size; ++d) res.v[d] = __builtin_popcount(v[d]); - } else { - __builtin_unreachable(); - } + if (sizeof(int_t) == sizeof(long long)) { + for (int d = 0; d < size; ++d) + res.v[d] = __builtin_popcountll(v[d]); + } else if (sizeof(int_t) == sizeof(long)) { + for (int d = 0; d < size; ++d) + res.v[d] = __builtin_popcountl(v[d]); + } else if (sizeof(int_t) <= sizeof(int)) { + for (int d = 0; d < size; ++d) + res.v[d] = __builtin_popcount(v[d]); + } else { + __builtin_unreachable(); + } #else - res = MF::vml_popcount(*this); + res = MF::vml_popcount(*this); #endif - return res; - } - - - - boolvec_t operator==(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - boolvec_t operator<(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d]; - return res; - } - boolvec_t operator<=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d]; - return res; - } - boolvec_t operator>(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d]; - return res; - } - boolvec_t operator>=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d]; - return res; - } - - intvec_t abs() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = std::abs(v[d]); - return res; - } - - boolvec_t isignbit() const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] < 0; - return res; - } - - intvec_t max(intvec_t x) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = std::max(v[d], x.v[d]); - return res; - } - - intvec_t min(intvec_t x) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = std::min(v[d], x.v[d]); - return res; - } - }; - - - - template<typename T, int N> - struct realpseudovec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef real_t scalar_t; - typedef real_t vector_t[size]; - static int const alignment = sizeof(real_t); - + return res; + } + + boolvec_t operator==(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + boolvec_t operator<(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] < x.v[d]; + return res; + } + boolvec_t operator<=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] <= x.v[d]; + return res; + } + boolvec_t operator>(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] > x.v[d]; + return res; + } + boolvec_t operator>=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] >= x.v[d]; + return res; + } + + intvec_t abs() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = std::abs(v[d]); + return res; + } + + boolvec_t isignbit() const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] < 0; + return res; + } + + intvec_t max(intvec_t x) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = std::max(v[d], x.v[d]); + return res; + } + + intvec_t min(intvec_t x) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = std::min(v[d], x.v[d]); + return res; + } +}; + +template <typename T, int N> struct realpseudovec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef real_t scalar_t; + typedef real_t vector_t[size]; + static int const alignment = sizeof(real_t); + #ifndef VML_NO_IOSTREAM - static char const* name() - { - static std::string name_; - if (name_.empty()) { - std::stringstream buf; - buf << "<libm:" << N << "*" << FP::name() << ">"; - name_ = buf.str(); - } - return name_.c_str(); + static char const *name() { + static std::string name_; + if (name_.empty()) { + std::stringstream buf; + buf << "<libm:" << N << "*" << FP::name() << ">"; + name_ = buf.str(); } + return name_.c_str(); + } #endif - void barrier() - { + void barrier() { #if defined __GNUC__ && !defined __clang__ && !defined __ICC - // GCC crashes when +X is used as constraint -# if defined __SSE2__ - for (int d=0; d<size; ++d) __asm__("": "+x"(v[d])); -# elif defined __PPC64__ // maybe also __PPC__ - for (int d=0; d<size; ++d) __asm__("": "+f"(v[d])); -# elif defined __arm__ - for (int d=0; d<size; ++d) __asm__("": "+w"(v[d])); -# else -# error "Floating point barrier undefined on this architecture" -# endif +// GCC crashes when +X is used as constraint +#if defined __SSE2__ + for (int d = 0; d < size; ++d) + __asm__("" : "+x"(v[d])); +#elif defined __PPC64__ // maybe also __PPC__ + for (int d = 0; d < size; ++d) + __asm__("" : "+f"(v[d])); +#elif defined __arm__ + for (int d = 0; d < size; ++d) + __asm__("" : "+w"(v[d])); +#else +#error "Floating point barrier undefined on this architecture" +#endif #elif defined __clang__ - for (int d=0; d<size; ++d) __asm__("": "+X"(v[d])); + for (int d = 0; d < size; ++d) + __asm__("" : "+X"(v[d])); #elif defined __ICC - for (int d=0; d<size; ++d) { - real_t tmp = v[d]; - __asm__("": "+X"(tmp)); - v[d] = tmp; - } + for (int d = 0; d < size; ++d) { + real_t tmp = v[d]; + __asm__("" : "+X"(tmp)); + v[d] = tmp; + } #elif defined __IBMCPP__ - for (int d=0; d<size; ++d) __asm__("": "+f"(v[d])); + for (int d = 0; d < size; ++d) + __asm__("" : "+f"(v[d])); #else -# error "Floating point barrier undefined on this architecture" +#error "Floating point barrier undefined on this architecture" #endif - } - - typedef boolpseudovec<real_t, size> boolvec_t; - typedef intpseudovec<real_t, size> intvec_t; - typedef realpseudovec realvec_t; - - private: - boolvec_t mapb(bool f(real_t)) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - intvec_t map(int_t f(real_t)) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - realvec_t map(real_t f(real_t)) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d]); - return res; - } - realvec_t map(real_t f(real_t, int_t), intvec_t x) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]); - return res; - } - realvec_t map(real_t f(real_t, real_t), realvec_t x) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d]); - return res; - } - realvec_t map(real_t f(real_t, real_t, real_t), - realvec_t x, realvec_t y) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = f(v[d], x.v[d], y.v[d]); - return res; - } - public: - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realpseudovec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realpseudovec(realpseudovec const& x): v(x.v) {} - // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; } - realpseudovec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; } - realpseudovec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - - real_t operator[](int n) const { return v[n]; } - realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loadu(p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = p[d]; - return res; - } - static realvec_t loadu(real_t const* p, size_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - return m.m.ifthen(loada(p), *this); - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - return m.m.ifthen(loadu(p), *this); - } - realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const - { - return m.m.ifthen(loadu(p, ioff), *this); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p); - } - void storeu(real_t* p) const - { - for (int d=0; d<size; ++d) p[d] = v[d]; - } - void storeu(real_t* p, size_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p, m); - } - void storeu(real_t* p, mask_t const& m) const - { - for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d]; - } - void storeu(real_t* p, size_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]); - return res; - } - intvec_t convert_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::convert_int(v[d]); - return res; - } - - - - realvec_t operator+() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = + v[d]; - return res; - } - realvec_t operator-() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = - v[d]; - return res; - } - - realvec_t& operator+=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] += x.v[d]; - return *this; - } - realvec_t& operator-=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] -= x.v[d]; - return *this; - } - realvec_t& operator*=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] *= x.v[d]; - return *this; - } - realvec_t& operator/=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] /= x.v[d]; - return *this; - } - - realvec_t operator+(realvec_t x) const - { - realvec_t res = *this; - return res += x; - } - realvec_t operator-(realvec_t x) const - { - realvec_t res = *this; - return res -= x; - } - realvec_t operator*(realvec_t x) const - { - realvec_t res = *this; - return res *= x; - } - realvec_t operator/(realvec_t x) const - { - realvec_t res = *this; - return res /= x; - } - - real_t maxval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]); - return res; - } - real_t minval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]); - return res; - } - real_t prod() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res *= v[d]; - return res; - } - real_t sum() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res += v[d]; - return res; - } - - - - boolvec_t operator==(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - boolvec_t operator<(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d]; - return res; - } - boolvec_t operator<=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d]; - return res; - } - boolvec_t operator>(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d]; - return res; - } - boolvec_t operator>=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d]; - return res; - } - - - - realvec_t acos() const { return map(vml_std::acos); } - realvec_t acosh() const { return map(vml_std::acosh); } - realvec_t asin() const { return map(vml_std::asin); } - realvec_t asinh() const { return map(vml_std::asinh); } - realvec_t atan() const { return map(vml_std::atan); } - realvec_t atan2(realvec_t y) const - { - return MF::vml_atan2(*this, y); - } - realvec_t atanh() const { return map(vml_std::atanh); } - realvec_t cbrt() const { return map(vml_std::cbrt); } - realvec_t ceil() const { return map(vml_std::ceil); } - realvec_t copysign(realvec_t y) const - { - return map(vml_std::copysign, y); - } - realvec_t cos() const { return map(vml_std::cos); } - realvec_t cosh() const { return map(vml_std::cosh); } - realvec_t exp() const { return map(vml_std::exp); } - realvec_t exp10() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = vml_std::exp(R(M_LN10) * v[d]); - return res; - } - realvec_t exp2() const { return map(vml_std::exp2); } - realvec_t expm1() const { return map(vml_std::expm1); } - realvec_t fabs() const { return map(vml_std::fabs); } - realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); } - realvec_t floor() const { return map(vml_std::floor); } - realvec_t fma(realvec_t y, realvec_t z) const - { - return map(vml_std::fma, y, z); - } - realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); } - realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); } - realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); } - realvec_t frexp(intvec_t* ires) const - { - realvec_t res; - for (int d=0; d<size; ++d) { - int iri; - real_t r = vml_std::frexp(v[d], &iri); - int_t ir = iri; + } + + typedef boolpseudovec<real_t, size> boolvec_t; + typedef intpseudovec<real_t, size> intvec_t; + typedef realpseudovec realvec_t; + +private: + boolvec_t mapb(bool f(real_t)) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + intvec_t map(int_t f(real_t)) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + realvec_t map(real_t f(real_t)) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d]); + return res; + } + realvec_t map(real_t f(real_t, int_t), intvec_t x) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d]); + return res; + } + realvec_t map(real_t f(real_t, real_t), realvec_t x) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d]); + return res; + } + realvec_t map(real_t f(real_t, real_t, real_t), realvec_t x, + realvec_t y) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = f(v[d], x.v[d], y.v[d]); + return res; + } + +public: + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realpseudovec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realpseudovec(realpseudovec const& x): v(x.v) {} + // realpseudovec& operator=(realpseudovec const& x) { return v=x.v, *this; } + realpseudovec(real_t a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + realpseudovec(real_t const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + + real_t operator[](int n) const { return v[n]; } + realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loadu(p); + } + static realvec_t loadu(real_t const *p) { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = p[d]; + return res; + } + static realvec_t loadu(real_t const *p, size_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const { + return m.m.ifthen(loadu(p, ioff), *this); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p); + } + void storeu(real_t *p) const { + for (int d = 0; d < size; ++d) + p[d] = v[d]; + } + void storeu(real_t *p, size_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p, m); + } + void storeu(real_t *p, mask_t const &m) const { + for (int d = 0; d < size; ++d) + if (m.m[d]) + p[d] = v[d]; + } + void storeu(real_t *p, size_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff, m); + } + + intvec_t as_int() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = FP::as_int(v[d]); + return res; + } + intvec_t convert_int() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = FP::convert_int(v[d]); + return res; + } + + realvec_t operator+() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = +v[d]; + return res; + } + realvec_t operator-() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = -v[d]; + return res; + } + + realvec_t &operator+=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] += x.v[d]; + return *this; + } + realvec_t &operator-=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] -= x.v[d]; + return *this; + } + realvec_t &operator*=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] *= x.v[d]; + return *this; + } + realvec_t &operator/=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] /= x.v[d]; + return *this; + } + + realvec_t operator+(realvec_t x) const { + realvec_t res = *this; + return res += x; + } + realvec_t operator-(realvec_t x) const { + realvec_t res = *this; + return res -= x; + } + realvec_t operator*(realvec_t x) const { + realvec_t res = *this; + return res *= x; + } + realvec_t operator/(realvec_t x) const { + realvec_t res = *this; + return res /= x; + } + + real_t maxval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res = vml_std::fmax(res, v[d]); + return res; + } + real_t minval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res = vml_std::fmin(res, v[d]); + return res; + } + real_t prod() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res *= v[d]; + return res; + } + real_t sum() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res += v[d]; + return res; + } + + boolvec_t operator==(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + boolvec_t operator<(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] < x.v[d]; + return res; + } + boolvec_t operator<=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] <= x.v[d]; + return res; + } + boolvec_t operator>(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] > x.v[d]; + return res; + } + boolvec_t operator>=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] >= x.v[d]; + return res; + } + + realvec_t acos() const { return map(vml_std::acos); } + realvec_t acosh() const { return map(vml_std::acosh); } + realvec_t asin() const { return map(vml_std::asin); } + realvec_t asinh() const { return map(vml_std::asinh); } + realvec_t atan() const { return map(vml_std::atan); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return map(vml_std::atanh); } + realvec_t cbrt() const { return map(vml_std::cbrt); } + realvec_t ceil() const { return map(vml_std::ceil); } + realvec_t copysign(realvec_t y) const { return map(vml_std::copysign, y); } + realvec_t cos() const { return map(vml_std::cos); } + realvec_t cosh() const { return map(vml_std::cosh); } + realvec_t exp() const { return map(vml_std::exp); } + realvec_t exp10() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = vml_std::exp(R(M_LN10) * v[d]); + return res; + } + realvec_t exp2() const { return map(vml_std::exp2); } + realvec_t expm1() const { return map(vml_std::expm1); } + realvec_t fabs() const { return map(vml_std::fabs); } + realvec_t fdim(realvec_t y) const { return map(vml_std::fdim, y); } + realvec_t floor() const { return map(vml_std::floor); } + realvec_t fma(realvec_t y, realvec_t z) const { + return map(vml_std::fma, y, z); + } + realvec_t fmax(realvec_t y) const { return map(vml_std::fmax, y); } + realvec_t fmin(realvec_t y) const { return map(vml_std::fmin, y); } + realvec_t fmod(realvec_t y) const { return map(vml_std::fmod, y); } + realvec_t frexp(intvec_t *ires) const { + realvec_t res; + for (int d = 0; d < size; ++d) { + int iri; + real_t r = vml_std::frexp(v[d], &iri); + int_t ir = iri; #if defined VML_HAVE_INF - if (vml_std::isinf(v[d])) ir = std::numeric_limits<int_t>::max(); + if (vml_std::isinf(v[d])) + ir = std::numeric_limits<int_t>::max(); #endif #if defined VML_HAVE_NAN - if (vml_std::isnan(v[d])) ir = std::numeric_limits<int_t>::min(); + if (vml_std::isnan(v[d])) + ir = std::numeric_limits<int_t>::min(); #endif - res.v[d] = r; - ires->v[d] = ir; - } - return res; + res.v[d] = r; + ires->v[d] = ir; } - realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); } - intvec_t ilogb() const - { - intvec_t res; - for (int d=0; d<size; ++d) { - int_t r = vml_std::ilogb(v[d]); - typedef std::numeric_limits<int_t> NL; - if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) { - r = NL::min(); + return res; + } + realvec_t hypot(realvec_t y) const { return map(vml_std::hypot, y); } + intvec_t ilogb() const { + intvec_t res; + for (int d = 0; d < size; ++d) { + int_t r = vml_std::ilogb(v[d]); + typedef std::numeric_limits<int_t> NL; + if (FP_ILOGB0 != NL::min() and v[d] == R(0.0)) { + r = NL::min(); #if defined VML_HAVE_INF - } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) { - r = NL::max(); + } else if (INT_MAX != NL::max() and vml_std::isinf(v[d])) { + r = NL::max(); #endif #if defined VML_HAVE_NAN - } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) { - r = NL::min(); + } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v[d])) { + r = NL::min(); #endif - } - res.v[d] = r; } - return res; + res.v[d] = r; } - boolvec_t isfinite() const { return mapb(vml_std::isfinite); } - boolvec_t isinf() const { return mapb(vml_std::isinf); } - boolvec_t isnan() const { return mapb(vml_std::isnan); } - boolvec_t isnormal() const { return mapb(vml_std::isnormal); } - realvec_t ldexp(int_t n) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n); - return res; - } - realvec_t ldexp(intvec_t n) const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = vml_std::ldexp(v[d], n.v[d]); - return res; - } - realvec_t log() const { return map(vml_std::log); } - realvec_t log10() const { return map(vml_std::log10); } - realvec_t log1p() const { return map(vml_std::log1p); } - realvec_t log2() const { return map(vml_std::log2); } - intvec_t lrint() const - { - realvec_t res; - if (sizeof(int_t) <= sizeof(long)) { - for (int d=0; d<size; ++d) res.v[d] = vml_std::lrint(v[d]); - } else if (sizeof(int_t) <= sizeof(long long)) { - for (int d=0; d<size; ++d) res.v[d] = vml_std::llrint(v[d]); - } else { - __builtin_unreachable(); - } - return res; - } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return map(vml_std::nextafter, y); - } - realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); } - realvec_t rcp() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = R(1.0) / v[d]; - return res; - } - realvec_t remainder(realvec_t y) const - { - return map(vml_std::remainder, y); - } - realvec_t rint() const { return map(vml_std::rint); } - realvec_t round() const { return map(vml_std::round); } - realvec_t rsqrt() const { return sqrt().rcp(); } - boolvec_t signbit() const { return mapb(vml_std::signbit); } - realvec_t sin() const { return map(vml_std::sin); } - realvec_t sinh() const { return map(vml_std::sinh); } - realvec_t sqrt() const { return map(vml_std::sqrt); } - realvec_t tan() const { return map(vml_std::tan); } - realvec_t tanh() const { return map(vml_std::tanh); } - realvec_t trunc() const { return map(vml_std::trunc); } - }; - - - - // boolpseudovec definitions - - template<typename T, int N> - inline - typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::as_int() const - { - return convert_int(); - } - - template<typename T, int N> - inline - typename boolpseudovec<T,N>::intvec_t boolpseudovec<T,N>::convert_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d]; return res; } - - template<typename T, int N> - inline - typename boolpseudovec<T,N>::boolvec_t - boolpseudovec<T,N>::ifthen(boolvec_t x, boolvec_t y) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; - return res; - } - - template<typename T, int N> - inline - typename boolpseudovec<T,N>::intvec_t - boolpseudovec<T,N>::ifthen(intvec_t x, intvec_t y) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; + boolvec_t isfinite() const { return mapb(vml_std::isfinite); } + boolvec_t isinf() const { return mapb(vml_std::isinf); } + boolvec_t isnan() const { return mapb(vml_std::isnan); } + boolvec_t isnormal() const { return mapb(vml_std::isnormal); } + realvec_t ldexp(int_t n) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = vml_std::ldexp(v[d], n); return res; } - - template<typename T, int N> - inline - typename boolpseudovec<T,N>::realvec_t - boolpseudovec<T,N>::ifthen(realvec_t x, realvec_t y) const - { + realvec_t ldexp(intvec_t n) const { realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; + for (int d = 0; d < size; ++d) + res.v[d] = vml_std::ldexp(v[d], n.v[d]); return res; } - - - - // intpseudovec definitions - - template<typename T, int N> - inline - typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::as_float() const - { + realvec_t log() const { return map(vml_std::log); } + realvec_t log10() const { return map(vml_std::log10); } + realvec_t log1p() const { return map(vml_std::log1p); } + realvec_t log2() const { return map(vml_std::log2); } + intvec_t lrint() const { realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]); + if (sizeof(int_t) <= sizeof(long)) { + for (int d = 0; d < size; ++d) + res.v[d] = vml_std::lrint(v[d]); + } else if (sizeof(int_t) <= sizeof(long long)) { + for (int d = 0; d < size; ++d) + res.v[d] = vml_std::llrint(v[d]); + } else { + __builtin_unreachable(); + } return res; } - - template<typename T, int N> - inline - intpseudovec<T,N> intpseudovec<T,N>::bitifthen(intvec_t x, intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - template<typename T, int N> - inline - typename intpseudovec<T,N>::realvec_t intpseudovec<T,N>::convert_float() const - { + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return map(vml_std::nextafter, y); } + realvec_t pow(realvec_t y) const { return map(vml_std::pow, y); } + realvec_t rcp() const { realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::convert_float(v[d]); + for (int d = 0; d < size; ++d) + res.v[d] = R(1.0) / v[d]; return res; } - - template<typename T, int N> - inline intpseudovec<T,N> intpseudovec<T,N>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - template<typename T, int N> - inline intpseudovec<T,N> intpseudovec<T,N>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - - - - // Wrappers - - // boolpseudovec wrappers - - template<typename real_t, int size> - inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline bool all(boolpseudovec<real_t, size> x) { return x.all(); } - - template<typename real_t, int size> - inline bool any(boolpseudovec<real_t, size> x) { return x.any(); } - - template<typename real_t, int size> - inline - boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, - boolpseudovec<real_t, size> x, - boolpseudovec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, - intpseudovec<real_t, size> x, - intpseudovec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, - realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return c.ifthen(x, y); - } - - - - // intpseudovec wrappers - - template<typename real_t, int size> - inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x) - { - return x.abs(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x) - { - return x.as_bool(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x) - { - return x.as_float(); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x, - intpseudovec<real_t, size> y, - intpseudovec<real_t, size> z) - { - return x.bitifthen(y, z); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x) - { - return x.clz(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x) - { - return x.convert_bool(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x) - { - return x.convert_float(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x) - { - return x.isignbit(); - } - - template<typename real_t, int size> - inline - intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x, - typename intpseudovec<real_t, size>::int_t n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x, - intpseudovec<real_t, size> n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x, - intpseudovec<real_t, size> y) - { - return x.max(y); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x, - intpseudovec<real_t, size> y) - { - return x.min(y); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x) - { - return x.popcount(); - } - - template<typename real_t, int size> - inline - intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x, - typename - intpseudovec<real_t, size>::int_t n) - { - return x.rotate(n); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x, - intpseudovec<real_t, size> n) - { - return x.rotate(n); - } - - - - // realpseudovec wrappers - - template<typename real_t, int size> - inline realpseudovec<real_t, size> - loada(real_t const* p, - realpseudovec<real_t, size> x, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.loada(p, m); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> - loadu(real_t const* p, - realpseudovec<real_t, size> x, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.loadu(p, m); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> - loadu(real_t const* p, size_t ioff, - realpseudovec<real_t, size> x, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.loadu(p, ioff, m); - } - - template<typename real_t, int size> - inline void storea(realpseudovec<real_t, size> x, real_t* p) - { - return x.storea(p); - } - - template<typename real_t, int size> - inline void storeu(realpseudovec<real_t, size> x, real_t* p) - { - return x.storeu(p); - } - - template<typename real_t, int size> - inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff) - { - return x.storeu(p, ioff); - } - - template<typename real_t, int size> - inline void storea(realpseudovec<real_t, size> x, real_t* p, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.storea(p, m); - } - - template<typename real_t, int size> - inline void storeu(realpseudovec<real_t, size> x, real_t* p, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.storeu(p, m); - } - - template<typename real_t, int size> - inline void storeu(realpseudovec<real_t, size> x, real_t* p, size_t ioff, - typename realpseudovec<real_t, size>::mask_t const& m) - { - return x.storeu(p, ioff, m); - } - - - - template<typename real_t, int size> - inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline real_t maxval(realpseudovec<real_t, size> x) - { - return x.maxval(); - } - - template<typename real_t, int size> - inline real_t minval(realpseudovec<real_t, size> x) - { - return x.minval(); - } - - template<typename real_t, int size> - inline real_t prod(realpseudovec<real_t, size> x) - { - return x.prod(); - } - - template<typename real_t, int size> - inline real_t sum(realpseudovec<real_t, size> x) - { - return x.sum(); - } - - - - template<typename real_t, int size> - inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x) - { - return x.acos(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x) - { - return x.acosh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x) - { - return x.asin(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x) - { - return x.asinh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x) - { - return x.atan(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.atan2(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x) - { - return x.atanh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x) - { - return x.cbrt(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x) - { - return x.ceil(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.copysign(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x) - { - return x.cos(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x) - { - return x.cosh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x) - { - return x.exp(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x) - { - return x.exp10(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x) - { - return x.exp2(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x) - { - return x.expm1(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x) - { - return x.fabs(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x) - { - return x.floor(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.fdim(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y, - realpseudovec<real_t, size> z) - { - return x.fma(y, z); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.fmax(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.fmin(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.fmod(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x, - intpseudovec<real_t, size>* r) - { - return x.frexp(r); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.hypot(y); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x) - { - return x.ilogb(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x) - { - return x.isfinite(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x) - { - return x.isinf(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x) - { - return x.isnan(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x) - { - return x.isnormal(); - } - - template<typename real_t, int size> - inline - realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x, - typename intpseudovec<real_t, size>::int_t - n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline - realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x, - intpseudovec<real_t, size> n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x) - { - return x.log(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x) - { - return x.log10(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x) - { - return x.log1p(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x) - { - return x.log2(); - } - - template<typename real_t, int size> - inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x) - { - return x.lrint(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y, - realpseudovec<real_t, size> z) - { - return x.mad(y, z); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.nextafter(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.pow(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x) - { - return x.rcp(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x, - realpseudovec<real_t, size> y) - { - return x.remainder(y); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x) - { - return x.rint(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x) - { - return x.round(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x) - { - return x.rsqrt(); - } - - template<typename real_t, int size> - inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x) - { - return x.signbit(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x) - { - return x.sin(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x) - { - return x.sinh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x) - { - return x.sqrt(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x) - { - return x.tan(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x) - { - return x.tanh(); - } - - template<typename real_t, int size> - inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x) - { - return x.trunc(); - } - - - + realvec_t remainder(realvec_t y) const { return map(vml_std::remainder, y); } + realvec_t rint() const { return map(vml_std::rint); } + realvec_t round() const { return map(vml_std::round); } + realvec_t rsqrt() const { return sqrt().rcp(); } + boolvec_t signbit() const { return mapb(vml_std::signbit); } + realvec_t sin() const { return map(vml_std::sin); } + realvec_t sinh() const { return map(vml_std::sinh); } + realvec_t sqrt() const { return map(vml_std::sqrt); } + realvec_t tan() const { return map(vml_std::tan); } + realvec_t tanh() const { return map(vml_std::tanh); } + realvec_t trunc() const { return map(vml_std::trunc); } +}; + +// boolpseudovec definitions + +template <typename T, int N> +inline typename boolpseudovec<T, N>::intvec_t +boolpseudovec<T, N>::as_int() const { + return convert_int(); +} + +template <typename T, int N> +inline typename boolpseudovec<T, N>::intvec_t +boolpseudovec<T, N>::convert_int() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d]; + return res; +} + +template <typename T, int N> +inline typename boolpseudovec<T, N>::boolvec_t +boolpseudovec<T, N>::ifthen(boolvec_t x, boolvec_t y) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +template <typename T, int N> +inline typename boolpseudovec<T, N>::intvec_t +boolpseudovec<T, N>::ifthen(intvec_t x, intvec_t y) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +template <typename T, int N> +inline typename boolpseudovec<T, N>::realvec_t +boolpseudovec<T, N>::ifthen(realvec_t x, realvec_t y) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +// intpseudovec definitions + +template <typename T, int N> +inline typename intpseudovec<T, N>::realvec_t +intpseudovec<T, N>::as_float() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = FP::as_float(v[d]); + return res; +} + +template <typename T, int N> +inline intpseudovec<T, N> intpseudovec<T, N>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +template <typename T, int N> +inline typename intpseudovec<T, N>::realvec_t +intpseudovec<T, N>::convert_float() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = FP::convert_float(v[d]); + return res; +} + +template <typename T, int N> +inline intpseudovec<T, N> intpseudovec<T, N>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +template <typename T, int N> +inline intpseudovec<T, N> intpseudovec<T, N>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + +// Wrappers + +// boolpseudovec wrappers + +template <typename real_t, int size> +inline intpseudovec<real_t, size> as_int(boolpseudovec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> convert_int(boolpseudovec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline bool all(boolpseudovec<real_t, size> x) { + return x.all(); +} + +template <typename real_t, int size> +inline bool any(boolpseudovec<real_t, size> x) { + return x.any(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, + boolpseudovec<real_t, size> x, + boolpseudovec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, + intpseudovec<real_t, size> x, + intpseudovec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> ifthen(boolpseudovec<real_t, size> c, + realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return c.ifthen(x, y); +} + +// intpseudovec wrappers + +template <typename real_t, int size> +inline intpseudovec<real_t, size> abs(intpseudovec<real_t, size> x) { + return x.abs(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> as_bool(intpseudovec<real_t, size> x) { + return x.as_bool(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> as_float(intpseudovec<real_t, size> x) { + return x.as_float(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> bitifthen(intpseudovec<real_t, size> x, + intpseudovec<real_t, size> y, + intpseudovec<real_t, size> z) { + return x.bitifthen(y, z); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> clz(intpseudovec<real_t, size> x) { + return x.clz(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> convert_bool(intpseudovec<real_t, size> x) { + return x.convert_bool(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> convert_float(intpseudovec<real_t, size> x) { + return x.convert_float(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> isignbit(intpseudovec<real_t, size> x) { + return x.isignbit(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> +lsr(intpseudovec<real_t, size> x, + typename intpseudovec<real_t, size>::int_t n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> lsr(intpseudovec<real_t, size> x, + intpseudovec<real_t, size> n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> max(intpseudovec<real_t, size> x, + intpseudovec<real_t, size> y) { + return x.max(y); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> min(intpseudovec<real_t, size> x, + intpseudovec<real_t, size> y) { + return x.min(y); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> popcount(intpseudovec<real_t, size> x) { + return x.popcount(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> +rotate(intpseudovec<real_t, size> x, + typename intpseudovec<real_t, size>::int_t n) { + return x.rotate(n); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> rotate(intpseudovec<real_t, size> x, + intpseudovec<real_t, size> n) { + return x.rotate(n); +} + +// realpseudovec wrappers + +template <typename real_t, int size> +inline realpseudovec<real_t, size> +loada(real_t const *p, realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.loada(p, m); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> +loadu(real_t const *p, realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.loadu(p, m); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> +loadu(real_t const *p, size_t ioff, realpseudovec<real_t, size> x, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.loadu(p, ioff, m); +} + +template <typename real_t, int size> +inline void storea(realpseudovec<real_t, size> x, real_t *p) { + return x.storea(p); +} + +template <typename real_t, int size> +inline void storeu(realpseudovec<real_t, size> x, real_t *p) { + return x.storeu(p); +} + +template <typename real_t, int size> +inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff) { + return x.storeu(p, ioff); +} + +template <typename real_t, int size> +inline void storea(realpseudovec<real_t, size> x, real_t *p, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.storea(p, m); +} + +template <typename real_t, int size> +inline void storeu(realpseudovec<real_t, size> x, real_t *p, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.storeu(p, m); +} + +template <typename real_t, int size> +inline void storeu(realpseudovec<real_t, size> x, real_t *p, size_t ioff, + typename realpseudovec<real_t, size>::mask_t const &m) { + return x.storeu(p, ioff, m); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> as_int(realpseudovec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> convert_int(realpseudovec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline real_t maxval(realpseudovec<real_t, size> x) { + return x.maxval(); +} + +template <typename real_t, int size> +inline real_t minval(realpseudovec<real_t, size> x) { + return x.minval(); +} + +template <typename real_t, int size> +inline real_t prod(realpseudovec<real_t, size> x) { + return x.prod(); +} + +template <typename real_t, int size> +inline real_t sum(realpseudovec<real_t, size> x) { + return x.sum(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> acos(realpseudovec<real_t, size> x) { + return x.acos(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> acosh(realpseudovec<real_t, size> x) { + return x.acosh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> asin(realpseudovec<real_t, size> x) { + return x.asin(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> asinh(realpseudovec<real_t, size> x) { + return x.asinh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> atan(realpseudovec<real_t, size> x) { + return x.atan(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> atan2(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.atan2(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> atanh(realpseudovec<real_t, size> x) { + return x.atanh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> cbrt(realpseudovec<real_t, size> x) { + return x.cbrt(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> ceil(realpseudovec<real_t, size> x) { + return x.ceil(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> copysign(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.copysign(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> cos(realpseudovec<real_t, size> x) { + return x.cos(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> cosh(realpseudovec<real_t, size> x) { + return x.cosh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> exp(realpseudovec<real_t, size> x) { + return x.exp(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> exp10(realpseudovec<real_t, size> x) { + return x.exp10(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> exp2(realpseudovec<real_t, size> x) { + return x.exp2(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> expm1(realpseudovec<real_t, size> x) { + return x.expm1(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fabs(realpseudovec<real_t, size> x) { + return x.fabs(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> floor(realpseudovec<real_t, size> x) { + return x.floor(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fdim(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.fdim(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fma(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y, + realpseudovec<real_t, size> z) { + return x.fma(y, z); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fmax(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.fmax(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fmin(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.fmin(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> fmod(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.fmod(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> frexp(realpseudovec<real_t, size> x, + intpseudovec<real_t, size> *r) { + return x.frexp(r); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> hypot(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.hypot(y); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> ilogb(realpseudovec<real_t, size> x) { + return x.ilogb(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> isfinite(realpseudovec<real_t, size> x) { + return x.isfinite(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> isinf(realpseudovec<real_t, size> x) { + return x.isinf(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> isnan(realpseudovec<real_t, size> x) { + return x.isnan(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> isnormal(realpseudovec<real_t, size> x) { + return x.isnormal(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> +ldexp(realpseudovec<real_t, size> x, + typename intpseudovec<real_t, size>::int_t n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> ldexp(realpseudovec<real_t, size> x, + intpseudovec<real_t, size> n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> log(realpseudovec<real_t, size> x) { + return x.log(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> log10(realpseudovec<real_t, size> x) { + return x.log10(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> log1p(realpseudovec<real_t, size> x) { + return x.log1p(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> log2(realpseudovec<real_t, size> x) { + return x.log2(); +} + +template <typename real_t, int size> +inline intpseudovec<real_t, size> lrint(realpseudovec<real_t, size> x) { + return x.lrint(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> mad(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y, + realpseudovec<real_t, size> z) { + return x.mad(y, z); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> nextafter(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.nextafter(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> pow(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.pow(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> rcp(realpseudovec<real_t, size> x) { + return x.rcp(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> remainder(realpseudovec<real_t, size> x, + realpseudovec<real_t, size> y) { + return x.remainder(y); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> rint(realpseudovec<real_t, size> x) { + return x.rint(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> round(realpseudovec<real_t, size> x) { + return x.round(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> rsqrt(realpseudovec<real_t, size> x) { + return x.rsqrt(); +} + +template <typename real_t, int size> +inline boolpseudovec<real_t, size> signbit(realpseudovec<real_t, size> x) { + return x.signbit(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> sin(realpseudovec<real_t, size> x) { + return x.sin(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> sinh(realpseudovec<real_t, size> x) { + return x.sinh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> sqrt(realpseudovec<real_t, size> x) { + return x.sqrt(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> tan(realpseudovec<real_t, size> x) { + return x.tan(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> tanh(realpseudovec<real_t, size> x) { + return x.tanh(); +} + +template <typename real_t, int size> +inline realpseudovec<real_t, size> trunc(realpseudovec<real_t, size> x) { + return x.trunc(); +} + #ifndef VML_NO_IOSTREAM - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - boolpseudovec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - intpseudovec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - realpseudovec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + boolpseudovec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + intpseudovec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, + realpseudovec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} #endif - + } // namespace vecmathlib -#endif // #ifndef VEC_PSEUDO_H +#endif // #ifndef VEC_PSEUDO_H diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h index 9fa6bd0..b88b0da 100644 --- a/vec_qpx_double4.h +++ b/vec_qpx_double4.h @@ -11,785 +11,662 @@ // QPX intrinsics #ifdef __clang__ -# include <qpxintrin.h> +#include <qpxintrin.h> #else -# include <builtins.h> +#include <builtins.h> #endif #include <mass_simd.h> - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_4 - template<> struct boolvec<double,4>; - template<> struct intvec<double,4>; - template<> struct realvec<double,4>; - - - - template<> - struct boolvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef bool scalar_t; - typedef vector4double bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // canonical true is +1.0, canonical false is -1.0 - // >=0 is true, -0 is true, nan is false - static real_t from_bool(bool a) { return a ? +1.0 : -1.0; } - static bool to_bool(real_t a) { return a>=0.0; } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vec_splats(from_bool(a))) {} - boolvec(const bool* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(v[n]); - } - boolvec& set_elt(int n, bool a) - { - return v[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return vec_not(v); } - - boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } - boolvec operator||(boolvec x) const { return vec_or(v, x.v); } - boolvec operator==(boolvec x) const - { - return vec_logical(v, x.v, 0x9); - } - boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; - boolvec x0123 = *this; - boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); - boolvec y0022 = x0123 && x1032; - return y0022[0] && y0022[2]; - } - bool any() const - { - // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; - boolvec x0123 = *this; - boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); - boolvec y0022 = x0123 || x1032; - return y0022[0] || y0022[2]; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef int_t scalar_t; - typedef vector4double ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(const intvec& x): v(x.v) {} - // intvec& operator=(const intvec& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats(FP::as_float(a))) {} - intvec(const int_t* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - static intvec iota() - { - const int_t iota_[] = {0, 1, 2, 3}; - return intvec(iota_); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return FP::as_int(v[n]); - } - intvec& set_elt(int n, int_t a) - { - return v[n]=FP::as_float(a), *this; - } - - - - // Vector casts do not change the bit battern - boolvec_t as_bool() const { return v; } - boolvec_t convert_bool() const { return *this != IV(I(0)); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return *this; } - intvec operator-() const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, -(*this)[d]); - return r; - } - - intvec operator+(intvec x) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] + x[d]); - return r; - } - intvec operator-(intvec x) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] - x[d]); - return r; - } - - intvec& operator+=(intvec x) { return *this=*this+x; } - intvec& operator-=(intvec x) { return *this=*this-x; } - - - - intvec operator~() const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, ~(*this)[d]); - return r; - } - - intvec operator&(intvec x) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] & x[d]); - return r; - } - intvec operator|(intvec x) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] | x[d]); - return r; - } - intvec operator^(intvec x) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] ^ x[d]); - return r; - } - - intvec& operator&=(intvec x) { return *this=*this&x; } - intvec& operator|=(intvec x) { return *this=*this|x; } - intvec& operator^=(intvec x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const - { - intvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n)); - return r; - } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n); - return r; - } - intvec operator<<(int_t n) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n); - return r; - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - intvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, U((*this)[d]) >> U(n[d])); - return r; - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >> n[d]); - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] << n[d]); - return r; - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] == x[d]); - return r; - } - boolvec_t operator!=(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] != x[d]); - return r; - } - boolvec_t operator<(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] < x[d]); - return r; - } - boolvec_t operator<=(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] <= x[d]); - return r; - } - boolvec_t operator>(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] > x[d]); - return r; - } - boolvec_t operator>=(intvec x) const - { - boolvec_t r; - for (int d=0; d<size; ++d) r.set_elt(d, (*this)[d] >= x[d]); - return r; - } - - intvec_t abs() const; - boolvec_t isignbit() const; - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<double,4>: floatprops<double> - { - static int const size = 4; - typedef real_t scalar_t; - typedef vector4double vector_t; - static int const alignment = sizeof(vector_t); - - static const char* name() { return "<QPX:4*double>"; } - void barrier() { __asm__("": "+v"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(const realvec& x): v(x.v) {} - // realvec& operator=(const realvec& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(const real_t* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return v[n]; - } - realvec& set_elt(int n, real_t a) - { - return v[n]=a, *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(const real_t* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_lda(0, (real_t*)p); - } - static realvec_t loadu(const real_t* p) - { - realvec_t v0 = vec_ld(0, (real_t*)p); - realvec_t v1 = vec_ld(31, (real_t*)p); - return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p)); - } - static realvec_t loadu(const real_t* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - // TODO: use load instruction with fixed offset - return loadu(p+ioff); - } - realvec_t loada(const real_t* p, mask_t m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(const real_t* p, mask_t m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(const real_t* p, std::ptrdiff_t ioff, mask_t m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - // TODO: use load instruction with fixed offset - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vec_sta(v, 0, p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - p[2] = (*this)[2]; - p[3] = (*this)[3]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, mask_t m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return v; } - intvec_t convert_int() const { return vec_ctidz(v); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return vec_neg(v); } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { return vec_mul(v, x.v); } - realvec operator/(realvec x) const - { - // return vec_swdiv_nochk(v, x.v); - return div_fastd4(v, x.v); - } - - realvec& operator+=(realvec x) { return *this=*this+x; } - realvec& operator-=(realvec x) { return *this=*this-x; } - realvec& operator*=(realvec x) { return *this=*this*x; } - realvec& operator/=(realvec x) { return *this=*this/x; } - - real_t maxval() const - { - // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), - // vml_std::fmax((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); - realvec_t y0022 = x0123.fmax(x1032); - return vml_std::fmax(y0022[0], y0022[2]); +template <> struct boolvec<double, 4>; +template <> struct intvec<double, 4>; +template <> struct realvec<double, 4>; + +template <> struct boolvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef bool scalar_t; + typedef vector4double bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // canonical true is +1.0, canonical false is -1.0 + // >=0 is true, -0 is true, nan is false + static real_t from_bool(bool a) { return a ? +1.0 : -1.0; } + static bool to_bool(real_t a) { return a >= 0.0; } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(vec_splats(from_bool(a))) {} + boolvec(const bool *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(v[n]); } + boolvec &set_elt(int n, bool a) { return v[n] = from_bool(a), *this; } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return vec_not(v); } + + boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } + boolvec operator||(boolvec x) const { return vec_or(v, x.v); } + boolvec operator==(boolvec x) const { return vec_logical(v, x.v, 0x9); } + boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } + + bool all() const { + // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; + boolvec x0123 = *this; + boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); + boolvec y0022 = x0123 && x1032; + return y0022[0] && y0022[2]; + } + bool any() const { + // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; + boolvec x0123 = *this; + boolvec x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); + boolvec y0022 = x0123 || x1032; + return y0022[0] || y0022[2]; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef int_t scalar_t; + typedef vector4double ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(const intvec& x): v(x.v) {} + // intvec& operator=(const intvec& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(vec_splats(FP::as_float(a))) {} + intvec(const int_t *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + static intvec iota() { + const int_t iota_[] = {0, 1, 2, 3}; + return intvec(iota_); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return FP::as_int(v[n]); } + intvec &set_elt(int n, int_t a) { return v[n] = FP::as_float(a), *this; } + + // Vector casts do not change the bit battern + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const { return *this != IV(I(0)); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec operator+() const { return *this; } + intvec operator-() const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, -(*this)[d]); + return r; + } + + intvec operator+(intvec x) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] + x[d]); + return r; + } + intvec operator-(intvec x) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] - x[d]); + return r; + } + + intvec &operator+=(intvec x) { return *this = *this + x; } + intvec &operator-=(intvec x) { return *this = *this - x; } + + intvec operator~() const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, ~(*this)[d]); + return r; + } + + intvec operator&(intvec x) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] & x[d]); + return r; + } + intvec operator|(intvec x) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] | x[d]); + return r; + } + intvec operator^(intvec x) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] ^ x[d]); + return r; + } + + intvec &operator&=(intvec x) { return *this = *this & x; } + intvec &operator|=(intvec x) { return *this = *this | x; } + intvec &operator^=(intvec x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { + intvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, U((*this)[d]) >> U(n)); + return r; + } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] >> n); + return r; + } + intvec operator<<(int_t n) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] << n); + return r; + } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + intvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, U((*this)[d]) >> U(n[d])); + return r; + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] >> n[d]); + return r; + } + intvec operator<<(intvec n) const { + intvec r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] << n[d]); + return r; + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] == x[d]); + return r; + } + boolvec_t operator!=(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] != x[d]); + return r; + } + boolvec_t operator<(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] < x[d]); + return r; + } + boolvec_t operator<=(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] <= x[d]); + return r; + } + boolvec_t operator>(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] > x[d]); + return r; + } + boolvec_t operator>=(intvec x) const { + boolvec_t r; + for (int d = 0; d < size; ++d) + r.set_elt(d, (*this)[d] >= x[d]); + return r; + } + + intvec_t abs() const; + boolvec_t isignbit() const; + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<double, 4> : floatprops<double> { + static int const size = 4; + typedef real_t scalar_t; + typedef vector4double vector_t; + static int const alignment = sizeof(vector_t); + + static const char *name() { return "<QPX:4*double>"; } + void barrier() { __asm__("" : "+v"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(const realvec& x): v(x.v) {} + // realvec& operator=(const realvec& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(vec_splats(a)) {} + realvec(const real_t *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { return v[n]; } + realvec &set_elt(int n, real_t a) { return v[n] = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(const real_t *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_lda(0, (real_t *)p); + } + static realvec_t loadu(const real_t *p) { + realvec_t v0 = vec_ld(0, (real_t *)p); + realvec_t v1 = vec_ld(31, (real_t *)p); + return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t *)p)); + } + static realvec_t loadu(const real_t *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + // TODO: use load instruction with fixed offset + return loadu(p + ioff); + } + realvec_t loada(const real_t *p, mask_t m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - real_t minval() const - { - // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), - // vml_std::fmin((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); - realvec_t y0022 = x0123.fmin(x1032); - return vml_std::fmin(y0022[0], y0022[2]); + } + realvec_t loadu(const real_t *p, mask_t m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - real_t prod() const - { - // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - realvec_t x = vec_xmul(v, v); - return x[1] * x[3]; + } + realvec_t loadu(const real_t *p, std::ptrdiff_t ioff, mask_t m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + // TODO: use load instruction with fixed offset + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + vec_sta(v, 0, p); + } + void storeu(real_t *p) const { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> + // for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + p[2] = (*this)[2]; + p[3] = (*this)[3]; + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - real_t sum() const - { - // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - realvec_t c1 = vec_logical(v, v, 0xf); // +1.0 - realvec_t x = vec_xxmadd(v, c1, v); - return x[0] + x[2]; + } + void storeu(real_t *p, mask_t m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - - - - boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec x) const { return ! (*this == x); } - boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec x) const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { return vec_ctidz(v); } + + realvec operator+() const { return *this; } + realvec operator-() const { return vec_neg(v); } + + realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { return vec_mul(v, x.v); } + realvec operator/(realvec x) const { + // return vec_swdiv_nochk(v, x.v); + return div_fastd4(v, x.v); + } + + realvec &operator+=(realvec x) { return *this = *this + x; } + realvec &operator-=(realvec x) { return *this = *this - x; } + realvec &operator*=(realvec x) { return *this = *this * x; } + realvec &operator/=(realvec x) { return *this = *this / x; } + + real_t maxval() const { + // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), + // vml_std::fmax((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); + realvec_t y0022 = x0123.fmax(x1032); + return vml_std::fmax(y0022[0], y0022[2]); + } + real_t minval() const { + // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), + // vml_std::fmin((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = vec_perm(x0123, x0123, vec_gpci(01032)); + realvec_t y0022 = x0123.fmin(x1032); + return vml_std::fmin(y0022[0], y0022[2]); + } + real_t prod() const { + // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + realvec_t x = vec_xmul(v, v); + return x[1] * x[3]; + } + real_t sum() const { + // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + realvec_t c1 = vec_logical(v, v, 0xf); // +1.0 + realvec_t x = vec_xxmadd(v, c1, v); + return x[0] + x[2]; + } + + boolvec_t operator==(realvec x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec x) const { return !(*this == x); } + boolvec_t operator<(realvec x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec x) const { #ifdef VML_HAVE_NAN - return *this < x || *this == x; + return *this < x || *this == x; #else - return ! (*this > x); + return !(*this > x); #endif - } - boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec x) const - { + } + boolvec_t operator>(realvec x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec x) const { #ifdef VML_HAVE_NAN - return *this > x || *this == x; + return *this > x || *this == x; #else - return ! (*this < x); + return !(*this < x); #endif - } - - - - realvec acos() const { return acosd4(v); } - realvec acosh() const { return acoshd4(v); } - realvec asin() const { return asind4(v); } - realvec asinh() const { return asinhd4(v); } - realvec atan() const { return atand4(v); } - realvec atan2(realvec y) const { return atan2d4(v, y.v); } - realvec atanh() const { return atanhd4(v); } - realvec cbrt() const { return cbrtd4(v); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); } - realvec cos() const { return cosd4(v); } - realvec cosh() const { return coshd4(v); } - realvec exp() const { return expd4(v); } - realvec exp10() const { return exp10d4(v); } - realvec exp2() const { return exp2d4(v); } - realvec expm1() const { return expm1d4(v); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const - { - return vec_madd(v, y.v, z.v); - } - realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); } - realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return hypotd4(v, y.v); } - intvec_t ilogb() const - { - // int_t ilogb_[] = { - // ::ilogb((*this)[0]), - // ::ilogb((*this)[1]), - // ::ilogb((*this)[2]), - // ::ilogb((*this)[3]) - // }; - // return intvec_t(ilogb_); - return MF::vml_ilogb(v); - } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { + } + + realvec acos() const { return acosd4(v); } + realvec acosh() const { return acoshd4(v); } + realvec asin() const { return asind4(v); } + realvec asinh() const { return asinhd4(v); } + realvec atan() const { return atand4(v); } + realvec atan2(realvec y) const { return atan2d4(v, y.v); } + realvec atanh() const { return atanhd4(v); } + realvec cbrt() const { return cbrtd4(v); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return vec_cpsgn(y.v, v); } + realvec cos() const { return cosd4(v); } + realvec cosh() const { return coshd4(v); } + realvec exp() const { return expd4(v); } + realvec exp10() const { return exp10d4(v); } + realvec exp2() const { return exp2d4(v); } + realvec expm1() const { return expm1d4(v); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); } + realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return hypotd4(v, y.v); } + intvec_t ilogb() const { + // int_t ilogb_[] = { + // ::ilogb((*this)[0]), + // ::ilogb((*this)[1]), + // ::ilogb((*this)[2]), + // ::ilogb((*this)[3]) + // }; + // return intvec_t(ilogb_); + return MF::vml_ilogb(v); + } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { #ifdef VML_HAVE_NAN - return vec_tstnan(v, v); + return vec_tstnan(v, v); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); } - realvec ldexp(intvec_t n) const - { - real_t ldexp_[] = { - vml_std::ldexp((*this)[0], n[0]), - vml_std::ldexp((*this)[1], n[1]), - vml_std::ldexp((*this)[2], n[2]), - vml_std::ldexp((*this)[3], n[3]) - }; - return realvec_t(ldexp_); - } - realvec log() const { return logd4(v); } - realvec log10() const { return log10d4(v); } - realvec log1p() const { return log1pd4(v); } - realvec log2() const { return log2d4(v); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec nextafter(realvec y) const - { - return MF::vml_nextafter(*this, y); - } - realvec pow(realvec y) const { return powd4(v, y.v); } - realvec rcp() const { return recip_fastd4(v); } - realvec remainder(realvec y) const - { - return MF::vml_remainder(*this, y); - } - realvec rint() const - { - return MF::vml_rint(*this); - // This is tempting, but seems too invasive - // #ifdef VML_HAVE_FP_CONTRACT - // return MF::vml_rint(*this); - // #else - // return vec_round(v); // use round instead of rint - // #endif - } - realvec round() const { return vec_round(v); } - realvec rsqrt() const - { - realvec x = *this; - realvec r = vec_rsqrte(x.v); // this is only an approximation - // TODO: use fma - // two Newton iterations (see vml_rsqrt) - r += RV(0.5)*r * (RV(1.0) - x * r*r); - r += RV(0.5)*r * (RV(1.0) - x * r*r); - return r; - } - boolvec_t signbit() const - { - return !RV(1.0).copysign(*this).as_int().as_bool(); - } - realvec sin() const { return sind4(v); } - realvec sinh() const { return sinhd4(v); } - realvec sqrt() const - { - // return vec_sqrtsw_nochk(v); - return *this * rsqrt(); - } - realvec tan() const { return tand4(v); } - realvec tanh() const { return tanhd4(v); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline intvec<double,4> boolvec<double,4>::as_int() const - { - return v; - } - - inline intvec<double,4> boolvec<double,4>::convert_int() const - { - return ifthen(IV(I(1)), IV(I(0))); - } - - inline - boolvec<double,4> - boolvec<double,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline - intvec<double,4> - boolvec<double,4>::ifthen(intvec_t x, intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - realvec<double,4> - boolvec<double,4>::ifthen(realvec_t x, realvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline intvec<double,4> intvec<double,4>::abs() const - { - return MF::vml_abs(*this); - } - - inline realvec<double,4> intvec<double,4>::as_float() const - { - return v; - } - - inline intvec<double,4> intvec<double,4>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<double,4> intvec<double,4>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<double,4> intvec<double,4>::convert_float() const - { - return vec_cfid(v); - } - - inline boolvec<double,4> intvec<double,4>::isignbit() const - { - return MF::vml_isignbit(*this); - } - - inline intvec<double,4> intvec<double,4>::max(intvec_t x) const - { - return MF::vml_max(*this, x); - } - - inline intvec<double,4> intvec<double,4>::min(intvec_t x) const - { - return MF::vml_min(*this, x); - } - - inline intvec<double,4> intvec<double,4>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<double,4> intvec<double,4>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<double,4> intvec<double,4>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - + } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); } + realvec ldexp(intvec_t n) const { + real_t ldexp_[] = { + vml_std::ldexp((*this)[0], n[0]), vml_std::ldexp((*this)[1], n[1]), + vml_std::ldexp((*this)[2], n[2]), vml_std::ldexp((*this)[3], n[3])}; + return realvec_t(ldexp_); + } + realvec log() const { return logd4(v); } + realvec log10() const { return log10d4(v); } + realvec log1p() const { return log1pd4(v); } + realvec log2() const { return log2d4(v); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return powd4(v, y.v); } + realvec rcp() const { return recip_fastd4(v); } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { + return MF::vml_rint(*this); + // This is tempting, but seems too invasive + // #ifdef VML_HAVE_FP_CONTRACT + // return MF::vml_rint(*this); + // #else + // return vec_round(v); // use round instead of rint + // #endif + } + realvec round() const { return vec_round(v); } + realvec rsqrt() const { + realvec x = *this; + realvec r = vec_rsqrte(x.v); // this is only an approximation + // TODO: use fma + // two Newton iterations (see vml_rsqrt) + r += RV(0.5) * r * (RV(1.0) - x * r * r); + r += RV(0.5) * r * (RV(1.0) - x * r * r); + return r; + } + boolvec_t signbit() const { + return !RV(1.0).copysign(*this).as_int().as_bool(); + } + realvec sin() const { return sind4(v); } + realvec sinh() const { return sinhd4(v); } + realvec sqrt() const { + // return vec_sqrtsw_nochk(v); + return *this * rsqrt(); + } + realvec tan() const { return tand4(v); } + realvec tanh() const { return tanhd4(v); } + realvec trunc() const { return vec_trunc(v); } +}; + +// boolvec definitions + +inline intvec<double, 4> boolvec<double, 4>::as_int() const { return v; } + +inline intvec<double, 4> boolvec<double, 4>::convert_int() const { + return ifthen(IV(I(1)), IV(I(0))); +} + +inline boolvec<double, 4> boolvec<double, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<double, 4> boolvec<double, 4>::ifthen(intvec_t x, + intvec_t y) const { + return ifthen(x.as_float(), y.as_float()).as_int(); +} + +inline realvec<double, 4> boolvec<double, 4>::ifthen(realvec_t x, + realvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +// intvec definitions + +inline intvec<double, 4> intvec<double, 4>::abs() const { + return MF::vml_abs(*this); +} + +inline realvec<double, 4> intvec<double, 4>::as_float() const { return v; } + +inline intvec<double, 4> intvec<double, 4>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<double, 4> intvec<double, 4>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<double, 4> intvec<double, 4>::convert_float() const { + return vec_cfid(v); +} + +inline boolvec<double, 4> intvec<double, 4>::isignbit() const { + return MF::vml_isignbit(*this); +} + +inline intvec<double, 4> intvec<double, 4>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<double, 4> intvec<double, 4>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<double, 4> intvec<double, 4>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<double, 4> intvec<double, 4>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 4> intvec<double, 4>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_QPX_DOUBLE4_H +#endif // #ifndef VEC_QPX_DOUBLE4_H diff --git a/vec_sse_double1.h b/vec_sse_double1.h index 5558356..d727de8 100644 --- a/vec_sse_double1.h +++ b/vec_sse_double1.h @@ -12,589 +12,493 @@ // SSE2 intrinsics #include <emmintrin.h> -#ifdef __SSE3__ // Intel's SSE 3 -# include <pmmintrin.h> +#ifdef __SSE3__ // Intel's SSE 3 +#include <pmmintrin.h> #endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include <smmintrin.h> +#ifdef __SSE4_1__ // Intel's SSE 4.1 +#include <smmintrin.h> #endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include <ammintrin.h> +#ifdef __SSE4A__ // AMD's SSE 4a +#include <ammintrin.h> #endif -#if defined __AVX__ // Intel's AVX -# include <immintrin.h> +#if defined __AVX__ // Intel's AVX +#include <immintrin.h> #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_1 - template<> struct boolvec<double,1>; - template<> struct intvec<double,1>; - template<> struct realvec<double,1>; - - - - template<> - struct boolvec<double,1>: floatprops<double> - { - static int const size = 1; - typedef bool scalar_t; - typedef uint_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - // true values are non-zero, false values are zero - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(a) {} - boolvec(bool const* as): v(as[0]) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return v; } - boolvec_t& set_elt(int n, bool a) { return v=a, *this; } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return !v; } - - boolvec_t operator&&(boolvec_t x) const { return v && x.v; } - boolvec_t operator||(boolvec_t x) const { return v || x.v; } - boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); } - boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); } - - bool all() const { return *this; } - bool any() const { return *this; } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,1>: floatprops<double> - { - static int const size = 1; - typedef int_t scalar_t; - typedef int_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(int_t a): v(a) {} - intvec(int_t const* as): v(as[0]) {} - static intvec_t iota() { return intvec(I(0)); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return v; } - intvec_t& set_elt(int n, int_t a) { return v=a, *this; } - - - - boolvec_t as_bool() const { return U(v); } - boolvec_t convert_bool() const { return bool(v); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec_t operator+() const { return +v; } - intvec_t operator-() const { return -v; } - - intvec_t operator+(intvec_t x) const { return v+x.v; } - intvec_t operator-(intvec_t x) const { return v-x.v; } - intvec_t operator*(intvec_t x) const { return v*x.v; } - intvec_t operator/(intvec_t x) const { return v/x.v; } - intvec_t operator%(intvec_t x) const { return v%x.v; } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; } - intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; } - intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; } - - - - intvec_t operator~() const { return ~v; } - - intvec_t operator&(intvec_t x) const { return v&x.v; } - intvec_t operator|(intvec_t x) const { return v|x.v; } - intvec_t operator^(intvec_t x) const { return v^x.v; } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const { return U(v) >> U(n); } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const { return v>>n; } - intvec_t operator<<(int_t n) const { return v<<n; } - - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const { return v>>n; } - intvec_t operator<<(intvec_t n) const { return v<<n; } - - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const { return __builtin_clzll(v); } - intvec_t popcount() const { return __builtin_popcountll(v); } - - - - boolvec_t operator==(intvec_t const& x) const { return v==x.v; } - boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; } - boolvec_t operator<(intvec_t const& x) const { return v<x.v; } - boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; } - boolvec_t operator>(intvec_t const& x) const { return v>x.v; } - boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; } - - intvec_t abs() const { return std::abs(v); } - boolvec_t isignbit() const { return v<0; } - intvec_t max(intvec_t x) const { return std::max(v, x.v); } - intvec_t min(intvec_t x) const { return std::min(v, x.v); } - }; - - - - template<> - struct realvec<double,1>: floatprops<double> - { - static int const size = 1; - typedef real_t scalar_t; - typedef double vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<SSE2:1*double>"; } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - private: - static __m128d from_double(double a) { return _mm_set_sd(a); } - static double to_double(__m128d a) { return _mm_cvtsd_f64(a); } - public: - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(real_t a): v(a) {} - realvec(real_t const* as): v(as[0]) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return v; } - realvec_t& set_elt(int n, real_t a) { return v=a, *this; } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return *p; - } - static realvec_t loadu(real_t const* p) - { - return *p; - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - *p = v; - } - void storeu(real_t* p) const - { - *p = v; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff); +template <> struct boolvec<double, 1>; +template <> struct intvec<double, 1>; +template <> struct realvec<double, 1>; + +template <> struct boolvec<double, 1> : floatprops<double> { + static int const size = 1; + typedef bool scalar_t; + typedef uint_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + // true values are non-zero, false values are zero + + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(a) {} + boolvec(bool const *as) : v(as[0]) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return v; } + boolvec_t &set_elt(int n, bool a) { return v = a, *this; } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return !v; } + + boolvec_t operator&&(boolvec_t x) const { return v && x.v; } + boolvec_t operator||(boolvec_t x) const { return v || x.v; } + boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); } + boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); } + + bool all() const { return *this; } + bool any() const { return *this; } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 1> : floatprops<double> { + static int const size = 1; + typedef int_t scalar_t; + typedef int_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(int_t a) : v(a) {} + intvec(int_t const *as) : v(as[0]) {} + static intvec_t iota() { return intvec(I(0)); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return v; } + intvec_t &set_elt(int n, int_t a) { return v = a, *this; } + + boolvec_t as_bool() const { return U(v); } + boolvec_t convert_bool() const { return bool(v); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec_t operator+() const { return +v; } + intvec_t operator-() const { return -v; } + + intvec_t operator+(intvec_t x) const { return v + x.v; } + intvec_t operator-(intvec_t x) const { return v - x.v; } + intvec_t operator*(intvec_t x) const { return v * x.v; } + intvec_t operator/(intvec_t x) const { return v / x.v; } + intvec_t operator%(intvec_t x) const { return v % x.v; } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; } + intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; } + intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; } + + intvec_t operator~() const { return ~v; } + + intvec_t operator&(intvec_t x) const { return v & x.v; } + intvec_t operator|(intvec_t x) const { return v | x.v; } + intvec_t operator^(intvec_t x) const { return v ^ x.v; } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { return U(v) >> U(n); } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { return v >> n; } + intvec_t operator<<(int_t n) const { return v << n; } + + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { return v >> n; } + intvec_t operator<<(intvec_t n) const { return v << n; } + + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const { return __builtin_clzll(v); } + intvec_t popcount() const { return __builtin_popcountll(v); } + + boolvec_t operator==(intvec_t const &x) const { return v == x.v; } + boolvec_t operator!=(intvec_t const &x) const { return v != x.v; } + boolvec_t operator<(intvec_t const &x) const { return v < x.v; } + boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; } + boolvec_t operator>(intvec_t const &x) const { return v > x.v; } + boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; } + + intvec_t abs() const { return std::abs(v); } + boolvec_t isignbit() const { return v < 0; } + intvec_t max(intvec_t x) const { return std::max(v, x.v); } + intvec_t min(intvec_t x) const { return std::min(v, x.v); } +}; + +template <> struct realvec<double, 1> : floatprops<double> { + static int const size = 1; + typedef real_t scalar_t; + typedef double vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<SSE2:1*double>"; } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + +private: + static __m128d from_double(double a) { return _mm_set_sd(a); } + static double to_double(__m128d a) { return _mm_cvtsd_f64(a); } + +public: + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(real_t a) : v(a) {} + realvec(real_t const *as) : v(as[0]) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return v; } + realvec_t &set_elt(int n, real_t a) { return v = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return *p; + } + static realvec_t loadu(real_t const *p) { return *p; } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return *this; } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return *this; } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + *p = v; + } + void storeu(real_t *p) const { *p = v; } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff, m); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); } - - - - intvec_t as_int() const { return floatprops::as_int(v); } - intvec_t convert_int() const { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p + ioff, m); + } + + intvec_t as_int() const { return floatprops::as_int(v); } + intvec_t convert_int() const { #ifdef __x86_64__ - return _mm_cvttsd_si64(_mm_set_sd(v)); + return _mm_cvttsd_si64(_mm_set_sd(v)); #else - return floatprops::convert_int(v); + return floatprops::convert_int(v); #endif - } - - - - realvec_t operator+() const { return +v; } - realvec_t operator-() const { return -v; } - - realvec_t operator+(realvec_t x) const { return v+x.v; } - realvec_t operator-(realvec_t x) const { return v-x.v; } - realvec_t operator*(realvec_t x) const { return v*x.v; } - realvec_t operator/(realvec_t x) const { return v/x.v; } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const { return *this; } - real_t minval() const { return *this; } - real_t prod() const { return *this; } - real_t sum() const { return *this; } - - - - boolvec_t operator==(realvec_t const& x) const { return v==x.v; } - boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; } - boolvec_t operator<(realvec_t const& x) const { return v<x.v; } - boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; } - boolvec_t operator>(realvec_t const& x) const { return v>x.v; } - boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const - { + } + + realvec_t operator+() const { return +v; } + realvec_t operator-() const { return -v; } + + realvec_t operator+(realvec_t x) const { return v + x.v; } + realvec_t operator-(realvec_t x) const { return v - x.v; } + realvec_t operator*(realvec_t x) const { return v * x.v; } + realvec_t operator/(realvec_t x) const { return v / x.v; } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { return *this; } + real_t minval() const { return *this; } + real_t prod() const { return *this; } + real_t sum() const { return *this; } + + boolvec_t operator==(realvec_t const &x) const { return v == x.v; } + boolvec_t operator!=(realvec_t const &x) const { return v != x.v; } + boolvec_t operator<(realvec_t const &x) const { return v < x.v; } + boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; } + boolvec_t operator>(realvec_t const &x) const { return v > x.v; } + boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { #ifdef __SSE4_1__ - return to_double(_mm_ceil_sd(from_double(v), from_double(v))); + return to_double(_mm_ceil_sd(from_double(v), from_double(v))); #else - return vml_std::ceil(v); + return vml_std::ceil(v); #endif - } - realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return vml_std::fabs(v); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const - { + } + realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return vml_std::fabs(v); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { #ifdef __SSE4_1__ - return to_double(_mm_floor_sd(from_double(v), from_double(v))); + return to_double(_mm_floor_sd(from_double(v), from_double(v))); #else - return vml_std::floor(v); + return vml_std::floor(v); #endif - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); - } - realvec_t fmax(realvec_t y) const - { - return to_double(_mm_max_sd(from_double(v), from_double(y.v))); - } - realvec_t fmin(realvec_t y) const - { - return to_double(_mm_min_sd(from_double(v), from_double(y.v))); - } - realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); } - realvec_t frexp(intvec_t* irp) const - { - int iri; - realvec_t r = vml_std::frexp(v, &iri); - int_t ir = iri; - if (isinf()) ir = std::numeric_limits<int_t>::max(); - if (isnan()) ir = std::numeric_limits<int_t>::min(); - irp->v = ir; - return r; - } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const - { - int_t r = vml_std::ilogb(v); - typedef std::numeric_limits<int_t> NL; - if (FP_ILOGB0 != NL::min() and v == R(0.0)) { - r = NL::min(); + } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { + return to_double(_mm_max_sd(from_double(v), from_double(y.v))); + } + realvec_t fmin(realvec_t y) const { + return to_double(_mm_min_sd(from_double(v), from_double(y.v))); + } + realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); } + realvec_t frexp(intvec_t *irp) const { + int iri; + realvec_t r = vml_std::frexp(v, &iri); + int_t ir = iri; + if (isinf()) + ir = std::numeric_limits<int_t>::max(); + if (isnan()) + ir = std::numeric_limits<int_t>::min(); + irp->v = ir; + return r; + } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { + int_t r = vml_std::ilogb(v); + typedef std::numeric_limits<int_t> NL; + if (FP_ILOGB0 != NL::min() and v == R(0.0)) { + r = NL::min(); #if defined VML_HAVE_INF - } else if (INT_MAX != NL::max() and vml_std::isinf(v)) { - r = NL::max(); + } else if (INT_MAX != NL::max() and vml_std::isinf(v)) { + r = NL::max(); #endif #if defined VML_HAVE_NAN - } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) { - r = NL::min(); + } else if (FP_ILOGBNAN != NL::min() and vml_std::isnan(v)) { + r = NL::min(); #endif - } - return r; - } - boolvec_t isfinite() const { return vml_std::isfinite(v); } - boolvec_t isinf() const { return vml_std::isinf(v); } - boolvec_t isnan() const - { - // This is wrong: - // return _mm_ucomineq_sd(from_double(v), from_double(v)); - // This works: - // char r; - // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v)); - // return boolvec_t::scalar_t(r); - // This works as well: - return vml_std::isnan(v); - } - boolvec_t isnormal() const { return vml_std::isnormal(v); } - realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); } - realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const { return R(1.0)/v; } - realvec_t remainder(realvec_t y) const - { - return vml_std::remainder(v, y.v); } - realvec_t rint() const - { + return r; + } + boolvec_t isfinite() const { return vml_std::isfinite(v); } + boolvec_t isinf() const { return vml_std::isinf(v); } + boolvec_t isnan() const { + // This is wrong: + // return _mm_ucomineq_sd(from_double(v), from_double(v)); + // This works: + // char r; + // __asm__("ucomisd %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v)); + // return boolvec_t::scalar_t(r); + // This works as well: + return vml_std::isnan(v); + } + boolvec_t isnormal() const { return vml_std::isnormal(v); } + realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); } + realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { return R(1.0) / v; } + realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); } + realvec_t rint() const { #ifdef __SSE4_1__ - return to_double(_mm_round_sd(from_double(v), from_double(v), - _MM_FROUND_TO_NEAREST_INT)); + return to_double(_mm_round_sd(from_double(v), from_double(v), + _MM_FROUND_TO_NEAREST_INT)); #else - return MF::vml_rint(*this); + return MF::vml_rint(*this); #endif - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return vml_std::signbit(v); } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const - { - return to_double(_mm_sqrt_sd(from_double(v), from_double(v))); - } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const - { + } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return vml_std::signbit(v); } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { + return to_double(_mm_sqrt_sd(from_double(v), from_double(v))); + } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { #ifdef __SSE4_1__ - return to_double(_mm_round_sd(from_double(v), from_double(v), - _MM_FROUND_TO_ZERO)); + return to_double( + _mm_round_sd(from_double(v), from_double(v), _MM_FROUND_TO_ZERO)); #else - return MF::vml_trunc(*this); + return MF::vml_trunc(*this); #endif - } - }; - - - - // boolvec definitions - - inline intvec<double,1> boolvec<double,1>::as_int() const - { - return I(v); } - - inline intvec<double,1> boolvec<double,1>::convert_int() const - { - return v; - } - - inline - boolvec<double,1> boolvec<double,1>::ifthen(boolvec_t x, boolvec_t y) const - { - return v ? x : y; - } - - inline - intvec<double,1> boolvec<double,1>::ifthen(intvec_t x, intvec_t y) const - { - return v ? x : y; - } - - inline - realvec<double,1> boolvec<double,1>::ifthen(realvec_t x, realvec_t y) const - { - return v ? x : y; - } - - - - // intvec definitions - - inline realvec<double,1> intvec<double,1>::as_float() const - { - return FP::as_float(v); - } - - inline realvec<double,1> intvec<double,1>::convert_float() const - { +}; + +// boolvec definitions + +inline intvec<double, 1> boolvec<double, 1>::as_int() const { return I(v); } + +inline intvec<double, 1> boolvec<double, 1>::convert_int() const { return v; } + +inline boolvec<double, 1> boolvec<double, 1>::ifthen(boolvec_t x, + boolvec_t y) const { + return v ? x : y; +} + +inline intvec<double, 1> boolvec<double, 1>::ifthen(intvec_t x, + intvec_t y) const { + return v ? x : y; +} + +inline realvec<double, 1> boolvec<double, 1>::ifthen(realvec_t x, + realvec_t y) const { + return v ? x : y; +} + +// intvec definitions + +inline realvec<double, 1> intvec<double, 1>::as_float() const { + return FP::as_float(v); +} + +inline realvec<double, 1> intvec<double, 1>::convert_float() const { #ifdef __x86_64__ - return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v)); + return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v)); #else - return FP::convert_float(v); + return FP::convert_float(v); #endif - } - - inline intvec<double,1> intvec<double,1>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<double,1> intvec<double,1>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<double,1> intvec<double,1>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +} + +inline intvec<double, 1> intvec<double, 1>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<double, 1> intvec<double, 1>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 1> intvec<double, 1>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_SSE_DOUBLE1_H +#endif // #ifndef VEC_SSE_DOUBLE1_H diff --git a/vec_sse_double2.h b/vec_sse_double2.h index 5d64688..095f458 100644 --- a/vec_sse_double2.h +++ b/vec_sse_double2.h @@ -11,737 +11,600 @@ // SSE2 intrinsics #include <emmintrin.h> -#ifdef __SSE3__ // Intel's SSE 3 -# include <pmmintrin.h> +#ifdef __SSE3__ // Intel's SSE 3 +#include <pmmintrin.h> #endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include <smmintrin.h> +#ifdef __SSE4_1__ // Intel's SSE 4.1 +#include <smmintrin.h> #endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include <ammintrin.h> +#ifdef __SSE4A__ // AMD's SSE 4a +#include <ammintrin.h> #endif -#if defined __AVX__ // Intel's AVX -# include <immintrin.h> +#if defined __AVX__ // Intel's AVX +#include <immintrin.h> #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_2 - template<> struct boolvec<double,2>; - template<> struct intvec<double,2>; - template<> struct realvec<double,2>; - - - - template<> - struct boolvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef bool scalar_t; - typedef __m128d bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec_t& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); } - - boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); } - boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); } - boolvec_t operator==(boolvec_t x) const { return !(*this!=x); } - boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); } - - bool all() const - { +template <> struct boolvec<double, 2>; +template <> struct intvec<double, 2>; +template <> struct realvec<double, 2>; + +template <> struct boolvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef bool scalar_t; + typedef __m128d bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {} + boolvec(bool const *as) + : v(_mm_castsi128_pd( + _mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec_t &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return _mm_xor_pd(boolvec(true), v); } + + boolvec_t operator&&(boolvec_t x) const { return _mm_and_pd(v, x.v); } + boolvec_t operator||(boolvec_t x) const { return _mm_or_pd(v, x.v); } + boolvec_t operator==(boolvec_t x) const { return !(*this != x); } + boolvec_t operator!=(boolvec_t x) const { return _mm_xor_pd(v, x.v); } + + bool all() const { #if defined __AVX__ - return ! (! *this).any(); + return !(!*this).any(); #else - return (*this)[0] && (*this)[1]; + return (*this)[0] && (*this)[1]; #endif - } - bool any() const - { + } + bool any() const { #if defined __AVX__ - return ! bool(_mm_testz_pd(v, v)); + return !bool(_mm_testz_pd(v, v)); #else - return (*this)[0] || (*this)[1]; + return (*this)[0] || (*this)[1]; #endif + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef int_t scalar_t; + typedef __m128i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm_set1_epi64x(a)) {} + intvec(int_t const *as) : v(_mm_set_epi64x(as[1], as[0])) {} + static intvec_t iota() { return _mm_set_epi64x(1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return _mm_castsi128_pd(v); } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare to zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec_t x = *this; + // We know that boolvec_t values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec_t operator+() const { return *this; } + intvec_t operator-() const { return IV(I(0)) - *this; } + + intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); } + intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + + intvec_t operator~() const { return IV(~U(0)) ^ *this; } + + intvec_t operator&(intvec_t x) const { + return _mm_castpd_si128( + _mm_and_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v))); + } + intvec_t operator|(intvec_t x) const { + return _mm_castpd_si128( + _mm_or_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v))); + } + intvec_t operator^(intvec_t x) const { + return _mm_castpd_si128( + _mm_xor_pd(_mm_castsi128_pd(v), _mm_castsi128_pd(x.v))); + } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { + // There is no _mm_srai_epi64. To emulate it, add 0x80000000 + // before shifting, and subtract the shifted 0x80000000 after + // shifting + intvec_t x = *this; + // Convert signed to unsiged + x += U(1) << (bits - 1); + // Shift + x = x.lsr(n); + // Undo conversion + x -= U(1) << (bits - 1 - n); + return x; + } + intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); } + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef int_t scalar_t; - typedef __m128i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm_set1_epi64x(a)) {} - intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {} - static intvec_t iota() { return _mm_set_epi64x(1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return _mm_castsi128_pd(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare to zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec_t x = *this; - // We know that boolvec_t values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec_t operator+() const { return *this; } - intvec_t operator-() const { return IV(I(0)) - *this; } - - intvec_t operator+(intvec_t x) const { return _mm_add_epi64(v, x.v); } - intvec_t operator-(intvec_t x) const { return _mm_sub_epi64(v, x.v); } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - - - - intvec_t operator~() const { return IV(~U(0)) ^ *this; } - - intvec_t operator&(intvec_t x) const - { - return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - intvec_t operator|(intvec_t x) const - { - return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - intvec_t operator^(intvec_t x) const - { - return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const { return _mm_srli_epi64(v, n); } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const - { - // There is no _mm_srai_epi64. To emulate it, add 0x80000000 - // before shifting, and subtract the shifted 0x80000000 after - // shifting - intvec_t x = *this; - // Convert signed to unsiged - x += U(1) << (bits-1); - // Shift - x = x.lsr(n); - // Undo conversion - x -= U(1) << (bits-1-n); - return x; - } - intvec_t operator<<(int_t n) const { return _mm_slli_epi64(v, n); } - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; - } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; - } - intvec_t operator<<(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; - } - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec_t const& x) const - { - return ! (*this != x); - } - boolvec_t operator!=(intvec_t const& x) const - { - return (*this ^ x).convert_bool(); - } - boolvec_t operator<(intvec_t const& x) const - { - // return (*this - x).as_bool(); - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; - } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef real_t scalar_t; - typedef __m128d vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<SSE2:2*double>"; } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm_set1_pd(a)) {} - realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm_load_pd(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm_loadu_pd(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } + return r; + } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); + return r; + } + intvec_t operator<<(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm_store_pd(p, v); + return r; + } + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec_t const &x) const { return !(*this != x); } + boolvec_t operator!=(intvec_t const &x) const { + return (*this ^ x).convert_bool(); + } + boolvec_t operator<(intvec_t const &x) const { + // return (*this - x).as_bool(); + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - void storeu(real_t* p) const - { - return _mm_storeu_pd(p, v); + return r; + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef real_t scalar_t; + typedef __m128d vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<SSE2:2*double>"; } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm_set1_pd(a)) {} + realvec(real_t const *as) : v(_mm_set_pd(as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm_load_pd(p); + } + static realvec_t loadu(real_t const *p) { return _mm_loadu_pd(p); } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm_store_pd(p, v); + } + void storeu(real_t *p) const { return _mm_storeu_pd(p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { #if defined __AVX__ - _mm_maskstore_pd(p, m.m.as_int(), v); + _mm_maskstore_pd(p, m.m.as_int(), v); #else - if (m.m[0]) _mm_storel_pd(p , v); - else if (m.m[1]) _mm_storeh_pd(p+1, v); + if (m.m[0]) + _mm_storel_pd(p, v); + else if (m.m[1]) + _mm_storeh_pd(p + 1, v); #endif - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) _mm_storel_pd(p , v); - else if (m.m[1]) _mm_storeh_pd(p+1, v); - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); } - - - - intvec_t as_int() const { return _mm_castpd_si128(v); } - intvec_t convert_int() const - { - intvec_t r; - r.set_elt(0, floatprops::convert_int((*this)[0])); - r.set_elt(1, floatprops::convert_int((*this)[1])); - return r; - } - - - - realvec_t operator+() const { return *this; } - realvec_t operator-() const { return RV(0.0) - *this; } - - realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); } - realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); } - realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); } - realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const - { - return vml_std::fmax((*this)[0], (*this)[1]); - } - real_t minval() const - { - return vml_std::fmin((*this)[0], (*this)[1]); - } - real_t prod() const - { - return (*this)[0] * (*this)[1]; + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) + _mm_storel_pd(p, v); + else if (m.m[1]) + _mm_storeh_pd(p + 1, v); } - real_t sum() const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return _mm_castpd_si128(v); } + intvec_t convert_int() const { + intvec_t r; + r.set_elt(0, floatprops::convert_int((*this)[0])); + r.set_elt(1, floatprops::convert_int((*this)[1])); + return r; + } + + realvec_t operator+() const { return *this; } + realvec_t operator-() const { return RV(0.0) - *this; } + + realvec_t operator+(realvec_t x) const { return _mm_add_pd(v, x.v); } + realvec_t operator-(realvec_t x) const { return _mm_sub_pd(v, x.v); } + realvec_t operator*(realvec_t x) const { return _mm_mul_pd(v, x.v); } + realvec_t operator/(realvec_t x) const { return _mm_div_pd(v, x.v); } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); } + real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); } + real_t prod() const { return (*this)[0] * (*this)[1]; } + real_t sum() const { #ifdef __SSE3__ - return _mm_cvtsd_f64(_mm_hadd_pd(v, v)); + return _mm_cvtsd_f64(_mm_hadd_pd(v, v)); #else - return (*this)[0] + (*this)[1]; + return (*this)[0] + (*this)[1]; #endif - } - - - - boolvec_t operator==(realvec_t const& x) const - { - return _mm_cmpeq_pd(v, x.v); - } - boolvec_t operator!=(realvec_t const& x) const - { - return _mm_cmpneq_pd(v, x.v); - } - boolvec_t operator<(realvec_t const& x) const - { - return _mm_cmplt_pd(v, x.v); - } - boolvec_t operator<=(realvec_t const& x) const - { - return _mm_cmple_pd(v, x.v); - } - boolvec_t operator>(realvec_t const& x) const - { - return _mm_cmpgt_pd(v, x.v); - } - boolvec_t operator>=(realvec_t const& x) const - { - return _mm_cmpge_pd(v, x.v); - } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const - { + } + + boolvec_t operator==(realvec_t const &x) const { + return _mm_cmpeq_pd(v, x.v); + } + boolvec_t operator!=(realvec_t const &x) const { + return _mm_cmpneq_pd(v, x.v); + } + boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_pd(v, x.v); } + boolvec_t operator<=(realvec_t const &x) const { + return _mm_cmple_pd(v, x.v); + } + boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_pd(v, x.v); } + boolvec_t operator>=(realvec_t const &x) const { + return _mm_cmpge_pd(v, x.v); + } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { #ifdef __SSE4_1__ - return _mm_ceil_pd(v); + return _mm_ceil_pd(v); #else - return MF::vml_ceil(*this); + return MF::vml_ceil(*this); #endif - } - realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return MF::vml_fabs(*this); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const - { + } + realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return MF::vml_fabs(*this); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { #ifdef __SSE4_1__ - return _mm_floor_pd(v); + return _mm_floor_pd(v); #else - return MF::vml_floor(*this); + return MF::vml_floor(*this); #endif - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); - } - realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); } - realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); } - realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } - realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { + } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); } + realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); } + realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } + realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { #ifdef VML_HAVE_NAN - return _mm_cmpunord_pd(v, v); + return _mm_cmpunord_pd(v, v); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); } - realvec_t remainder(realvec_t y) const - { - return MF::vml_remainder(*this, y); - } - realvec_t rint() const - { + } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); } + realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } + realvec_t rint() const { #ifdef __SSE4_1__ - return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT); + return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT); #else - return MF::vml_rint(*this); + return MF::vml_rint(*this); #endif - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return v; } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return _mm_sqrt_pd(v); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const - { + } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return v; } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return _mm_sqrt_pd(v); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { #ifdef __SSE4_1__ - return _mm_round_pd(v, _MM_FROUND_TO_ZERO); + return _mm_round_pd(v, _MM_FROUND_TO_ZERO); #else - return MF::vml_trunc(*this); + return MF::vml_trunc(*this); #endif - } - }; - - - - // boolvec definitions - - inline intvec<double,2> boolvec<double,2>::as_int() const - { - return _mm_castpd_si128(v); - } - - inline intvec<double,2> boolvec<double,2>::convert_int() const - { - //return ifthen(v, U(1), U(0)); - return lsr(as_int(), bits-1); - } - - inline - boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline - intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const - { + } +}; + +// boolvec definitions + +inline intvec<double, 2> boolvec<double, 2>::as_int() const { + return _mm_castpd_si128(v); +} + +inline intvec<double, 2> boolvec<double, 2>::convert_int() const { + // return ifthen(v, U(1), U(0)); + return lsr(as_int(), bits - 1); +} + +inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x, + intvec_t y) const { + return ifthen(x.as_float(), y.as_float()).as_int(); +} + +inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x, + realvec_t y) const { #ifdef __SSE4_1__ - return _mm_blendv_pd(y.v, x.v, v); + return _mm_blendv_pd(y.v, x.v, v); #else - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); + return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int())) + .as_float(); #endif - } - - - - // intvec definitions - - inline realvec<double,2> intvec<double,2>::as_float() const - { - return _mm_castsi128_pd(v); - } - - inline realvec<double,2> intvec<double,2>::convert_float() const - { - realvec_t r; - r.set_elt(0, floatprops::convert_float((*this)[0])); - r.set_elt(1, floatprops::convert_float((*this)[1])); - return r; - } - - inline intvec<double,2> intvec<double,2>::abs() const - { - return MF::vml_abs(*this); - } - - inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<double,2> intvec<double,2>::clz() const - { - return MF::vml_clz(*this); - } - - inline intvec<double,2> intvec<double,2>::max(intvec_t x) const - { - return MF::vml_max(*this, x); - } - - inline intvec<double,2> intvec<double,2>::min(intvec_t x) const - { - return MF::vml_min(*this, x); - } - - inline intvec<double,2> intvec<double,2>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<double,2> intvec<double,2>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +} + +// intvec definitions + +inline realvec<double, 2> intvec<double, 2>::as_float() const { + return _mm_castsi128_pd(v); +} + +inline realvec<double, 2> intvec<double, 2>::convert_float() const { + realvec_t r; + r.set_elt(0, floatprops::convert_float((*this)[0])); + r.set_elt(1, floatprops::convert_float((*this)[1])); + return r; +} + +inline intvec<double, 2> intvec<double, 2>::abs() const { + return MF::vml_abs(*this); +} + +inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<double, 2> intvec<double, 2>::clz() const { + return MF::vml_clz(*this); +} + +inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<double, 2> intvec<double, 2>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_SSE_DOUBLE2_H +#endif // #ifndef VEC_SSE_DOUBLE2_H diff --git a/vec_sse_float1.h b/vec_sse_float1.h index 9cee891..a84a046 100644 --- a/vec_sse_float1.h +++ b/vec_sse_float1.h @@ -12,583 +12,489 @@ // SSE2 intrinsics #include <emmintrin.h> -#ifdef __SSE3__ // Intel's SSE 3 -# include <pmmintrin.h> +#ifdef __SSE3__ // Intel's SSE 3 +#include <pmmintrin.h> #endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include <smmintrin.h> +#ifdef __SSE4_1__ // Intel's SSE 4.1 +#include <smmintrin.h> #endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include <ammintrin.h> +#ifdef __SSE4A__ // AMD's SSE 4a +#include <ammintrin.h> #endif -#if defined __AVX__ // Intel's AVX -# include <immintrin.h> +#if defined __AVX__ // Intel's AVX +#include <immintrin.h> #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_1 - template<> struct boolvec<float,1>; - template<> struct intvec<float,1>; - template<> struct realvec<float,1>; - - - - template<> - struct boolvec<float,1>: floatprops<float> - { - static int const size = 1; - typedef bool scalar_t; - typedef uint_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - // true values are non-zero, false values are zero - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(a) {} - boolvec(bool const* as): v(as[0]) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return v; } - boolvec_t& set_elt(int n, bool a) { return v=a, *this; } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return !v; } - - boolvec_t operator&&(boolvec_t x) const { return v && x.v; } - boolvec_t operator||(boolvec_t x) const { return v || x.v; } - boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); } - boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); } - - bool all() const { return *this; } - bool any() const { return *this; } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,1>: floatprops<float> - { - static int const size = 1; - typedef int_t scalar_t; - typedef int_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(int_t a): v(a) {} - intvec(int_t const* as): v(as[0]) {} - static intvec_t iota() { return intvec(I(0)); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return v; } - intvec_t& set_elt(int n, int_t a) { return v=a, *this; } - - - - boolvec_t as_bool() const { return U(v); } - boolvec_t convert_bool() const { return bool(v); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec_t operator+() const { return +v; } - intvec_t operator-() const { return -v; } - - intvec_t operator+(intvec_t x) const { return v+x.v; } - intvec_t operator-(intvec_t x) const { return v-x.v; } - intvec_t operator*(intvec_t x) const { return v*x.v; } - intvec_t operator/(intvec_t x) const { return v/x.v; } - intvec_t operator%(intvec_t x) const { return v%x.v; } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - intvec_t& operator*=(intvec_t const& x) { return *this=*this*x; } - intvec_t& operator/=(intvec_t const& x) { return *this=*this/x; } - intvec_t& operator%=(intvec_t const& x) { return *this=*this%x; } - - - - intvec_t operator~() const { return ~v; } - - intvec_t operator&(intvec_t x) const { return v&x.v; } - intvec_t operator|(intvec_t x) const { return v|x.v; } - intvec_t operator^(intvec_t x) const { return v^x.v; } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const { return U(v) >> U(n); } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const { return v>>n; } - intvec_t operator<<(int_t n) const { return v<<n; } - - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const { return v>>n; } - intvec_t operator<<(intvec_t n) const { return v<<n; } - - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const { return __builtin_clz(v); } - intvec_t popcount() const { return __builtin_popcount(v); } - - - - boolvec_t operator==(intvec_t const& x) const { return v==x.v; } - boolvec_t operator!=(intvec_t const& x) const { return v!=x.v; } - boolvec_t operator<(intvec_t const& x) const { return v<x.v; } - boolvec_t operator<=(intvec_t const& x) const { return v<=x.v; } - boolvec_t operator>(intvec_t const& x) const { return v>x.v; } - boolvec_t operator>=(intvec_t const& x) const { return v>=x.v; } - - intvec_t abs() const { return std::abs(v); } - boolvec_t isignbit() const { return v<0; } - intvec_t max(intvec_t x) const { return std::max(v, x.v); } - intvec_t min(intvec_t x) const { return std::min(v, x.v); } - }; - - - - template<> - struct realvec<float,1>: floatprops<float> - { - static int const size = 1; - typedef real_t scalar_t; - typedef float vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<SSE2:1*float>"; } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - private: - static __m128 from_float(float a) { return _mm_set_ss(a); } - static float to_float(__m128 a) { return _mm_cvtss_f32(a); } - public: - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(real_t a): v(a) {} - realvec(real_t const* as): v(as[0]) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return v; } - realvec_t& set_elt(int n, real_t a) { return v=a, *this; } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return *p; - } - static realvec_t loadu(real_t const* p) - { - return *p; - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - *p = v; - } - void storeu(real_t* p) const - { - *p = v; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } +template <> struct boolvec<float, 1>; +template <> struct intvec<float, 1>; +template <> struct realvec<float, 1>; + +template <> struct boolvec<float, 1> : floatprops<float> { + static int const size = 1; + typedef bool scalar_t; + typedef uint_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + // true values are non-zero, false values are zero + + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(a) {} + boolvec(bool const *as) : v(as[0]) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return v; } + boolvec_t &set_elt(int n, bool a) { return v = a, *this; } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return !v; } + + boolvec_t operator&&(boolvec_t x) const { return v && x.v; } + boolvec_t operator||(boolvec_t x) const { return v || x.v; } + boolvec_t operator==(boolvec_t x) const { return bool(v) == bool(x.v); } + boolvec_t operator!=(boolvec_t x) const { return bool(v) != bool(x.v); } + + bool all() const { return *this; } + bool any() const { return *this; } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 1> : floatprops<float> { + static int const size = 1; + typedef int_t scalar_t; + typedef int_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(int_t a) : v(a) {} + intvec(int_t const *as) : v(as[0]) {} + static intvec_t iota() { return intvec(I(0)); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return v; } + intvec_t &set_elt(int n, int_t a) { return v = a, *this; } + + boolvec_t as_bool() const { return U(v); } + boolvec_t convert_bool() const { return bool(v); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + intvec_t operator+() const { return +v; } + intvec_t operator-() const { return -v; } + + intvec_t operator+(intvec_t x) const { return v + x.v; } + intvec_t operator-(intvec_t x) const { return v - x.v; } + intvec_t operator*(intvec_t x) const { return v * x.v; } + intvec_t operator/(intvec_t x) const { return v / x.v; } + intvec_t operator%(intvec_t x) const { return v % x.v; } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + intvec_t &operator*=(intvec_t const &x) { return *this = *this * x; } + intvec_t &operator/=(intvec_t const &x) { return *this = *this / x; } + intvec_t &operator%=(intvec_t const &x) { return *this = *this % x; } + + intvec_t operator~() const { return ~v; } + + intvec_t operator&(intvec_t x) const { return v & x.v; } + intvec_t operator|(intvec_t x) const { return v | x.v; } + intvec_t operator^(intvec_t x) const { return v ^ x.v; } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { return U(v) >> U(n); } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { return v >> n; } + intvec_t operator<<(int_t n) const { return v << n; } + + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { return U(v) >> U(n); } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { return v >> n; } + intvec_t operator<<(intvec_t n) const { return v << n; } + + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const { return __builtin_clz(v); } + intvec_t popcount() const { return __builtin_popcount(v); } + + boolvec_t operator==(intvec_t const &x) const { return v == x.v; } + boolvec_t operator!=(intvec_t const &x) const { return v != x.v; } + boolvec_t operator<(intvec_t const &x) const { return v < x.v; } + boolvec_t operator<=(intvec_t const &x) const { return v <= x.v; } + boolvec_t operator>(intvec_t const &x) const { return v > x.v; } + boolvec_t operator>=(intvec_t const &x) const { return v >= x.v; } + + intvec_t abs() const { return std::abs(v); } + boolvec_t isignbit() const { return v < 0; } + intvec_t max(intvec_t x) const { return std::max(v, x.v); } + intvec_t min(intvec_t x) const { return std::min(v, x.v); } +}; + +template <> struct realvec<float, 1> : floatprops<float> { + static int const size = 1; + typedef real_t scalar_t; + typedef float vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<SSE2:1*float>"; } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + +private: + static __m128 from_float(float a) { return _mm_set_ss(a); } + static float to_float(__m128 a) { return _mm_cvtss_f32(a); } + +public: + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(real_t a) : v(a) {} + realvec(real_t const *as) : v(as[0]) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return v; } + realvec_t &set_elt(int n, real_t a) { return v = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return *p; + } + static realvec_t loadu(real_t const *p) { return *p; } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return *this; } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return *this; } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff, m); + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + *p = v; + } + void storeu(real_t *p) const { *p = v; } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); } - - - - intvec_t as_int() const { return floatprops::as_int(v); } - intvec_t convert_int() const { - // return floatprops::convert_int(v); - return _mm_cvttss_si32(_mm_set_ss(v)); + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); } - - - - realvec_t operator+() const { return +v; } - realvec_t operator-() const { return -v; } - - realvec_t operator+(realvec_t x) const { return v+x.v; } - realvec_t operator-(realvec_t x) const { return v-x.v; } - realvec_t operator*(realvec_t x) const { return v*x.v; } - realvec_t operator/(realvec_t x) const { return v/x.v; } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const { return *this; } - real_t minval() const { return *this; } - real_t prod() const { return *this; } - real_t sum() const { return *this; } - - - - boolvec_t operator==(realvec_t const& x) const { return v==x.v; } - boolvec_t operator!=(realvec_t const& x) const { return v!=x.v; } - boolvec_t operator<(realvec_t const& x) const { return v<x.v; } - boolvec_t operator<=(realvec_t const& x) const { return v<=x.v; } - boolvec_t operator>(realvec_t const& x) const { return v>x.v; } - boolvec_t operator>=(realvec_t const& x) const { return v>=x.v; } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p + ioff, m); + } + + intvec_t as_int() const { return floatprops::as_int(v); } + intvec_t convert_int() const { + // return floatprops::convert_int(v); + return _mm_cvttss_si32(_mm_set_ss(v)); + } + + realvec_t operator+() const { return +v; } + realvec_t operator-() const { return -v; } + + realvec_t operator+(realvec_t x) const { return v + x.v; } + realvec_t operator-(realvec_t x) const { return v - x.v; } + realvec_t operator*(realvec_t x) const { return v * x.v; } + realvec_t operator/(realvec_t x) const { return v / x.v; } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { return *this; } + real_t minval() const { return *this; } + real_t prod() const { return *this; } + real_t sum() const { return *this; } + + boolvec_t operator==(realvec_t const &x) const { return v == x.v; } + boolvec_t operator!=(realvec_t const &x) const { return v != x.v; } + boolvec_t operator<(realvec_t const &x) const { return v < x.v; } + boolvec_t operator<=(realvec_t const &x) const { return v <= x.v; } + boolvec_t operator>(realvec_t const &x) const { return v > x.v; } + boolvec_t operator>=(realvec_t const &x) const { return v >= x.v; } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { #ifdef __SSE4_1__ - return to_float(_mm_ceil_ss(from_float(v), from_float(v))); + return to_float(_mm_ceil_ss(from_float(v), from_float(v))); #else - return vml_std::ceil(v); + return vml_std::ceil(v); #endif - } - realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return vml_std::fabs(v); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const - { + } + realvec_t copysign(realvec_t y) const { return vml_std::copysign(v, y.v); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return vml_std::fabs(v); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { #ifdef __SSE4_1__ - return to_float(_mm_floor_ss(from_float(v), from_float(v))); + return to_float(_mm_floor_ss(from_float(v), from_float(v))); #else - return vml_std::floor(v); + return vml_std::floor(v); #endif - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); - } - realvec_t fmax(realvec_t y) const - { - return to_float(_mm_max_ss(from_float(v), from_float(y.v))); - } - realvec_t fmin(realvec_t y) const - { - return to_float(_mm_min_ss(from_float(v), from_float(y.v))); - } - realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); } - realvec_t frexp(intvec_t* irp) const - { - int iri; - realvec_t r = vml_std::frexp(v, &iri); - int_t ir = iri; - if (isinf()) ir = std::numeric_limits<int_t>::max(); - if (isnan()) ir = std::numeric_limits<int_t>::min(); - irp->v = ir; - return r; - } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const - { - int_t r = vml_std::ilogb(v); - typedef std::numeric_limits<int_t> NL; - if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) { - r = NL::min(); + } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { + return to_float(_mm_max_ss(from_float(v), from_float(y.v))); + } + realvec_t fmin(realvec_t y) const { + return to_float(_mm_min_ss(from_float(v), from_float(y.v))); + } + realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); } + realvec_t frexp(intvec_t *irp) const { + int iri; + realvec_t r = vml_std::frexp(v, &iri); + int_t ir = iri; + if (isinf()) + ir = std::numeric_limits<int_t>::max(); + if (isnan()) + ir = std::numeric_limits<int_t>::min(); + irp->v = ir; + return r; + } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { + int_t r = vml_std::ilogb(v); + typedef std::numeric_limits<int_t> NL; + if (FP_ILOGB0 != NL::min() and *this == RV(R(0.0))) { + r = NL::min(); #if defined VML_HAVE_INF - } else if (INT_MAX != NL::max() and vml_std::isinf(v)) { - r = NL::max(); + } else if (INT_MAX != NL::max() and vml_std::isinf(v)) { + r = NL::max(); #endif #if defined VML_HAVE_NAN - } else if (FP_ILOGBNAN != NL::min() and isnan()) { - r = NL::min(); + } else if (FP_ILOGBNAN != NL::min() and isnan()) { + r = NL::min(); #endif - } - return r; } - boolvec_t isfinite() const { return vml_std::isfinite(v); } - boolvec_t isinf() const { return vml_std::isinf(v); } - boolvec_t isnan() const - { + return r; + } + boolvec_t isfinite() const { return vml_std::isfinite(v); } + boolvec_t isinf() const { return vml_std::isinf(v); } + boolvec_t isnan() const { #if defined VML_HAVE_NAN - // This is wrong: - // return _mm_ucomineq_ss(from_float(v), from_float(v)); - // This works: - // char r; - // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v)); - // return boolvec_t::scalar_t(r); - // This works as well: - return vml_std::isnan(v); + // This is wrong: + // return _mm_ucomineq_ss(from_float(v), from_float(v)); + // This works: + // char r; + // __asm__("ucomiss %[v],%[v]; setp %[r]": [r]"=q"(r): [v]"x"(v)); + // return boolvec_t::scalar_t(r); + // This works as well: + return vml_std::isnan(v); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return vml_std::isnormal(v); } - realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); } - realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const { return R(1.0)/v; } - realvec_t remainder(realvec_t y) const - { - return vml_std::remainder(v, y.v); - } - realvec_t rint() const - { + } + boolvec_t isnormal() const { return vml_std::isnormal(v); } + realvec_t ldexp(int_t n) const { return vml_std::ldexp(v, n); } + realvec_t ldexp(intvec_t n) const { return vml_std::ldexp(v, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { return R(1.0) / v; } + realvec_t remainder(realvec_t y) const { return vml_std::remainder(v, y.v); } + realvec_t rint() const { #ifdef __SSE4_1__ - return to_float(_mm_round_ss(from_float(v), from_float(v), - _MM_FROUND_TO_NEAREST_INT)); + return to_float( + _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_NEAREST_INT)); #else - return MF::vml_rint(*this); + return MF::vml_rint(*this); #endif - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return vml_std::signbit(v); } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const - { + } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return vml_std::signbit(v); } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { #ifdef __SSE4_1__ - return to_float(_mm_round_ss(from_float(v), from_float(v), - _MM_FROUND_TO_ZERO)); + return to_float( + _mm_round_ss(from_float(v), from_float(v), _MM_FROUND_TO_ZERO)); #else - return MF::vml_trunc(*this); + return MF::vml_trunc(*this); #endif - } - }; - - - - // boolvec definitions - - inline intvec<float,1> boolvec<float,1>::as_int() const - { - return I(v); - } - - inline intvec<float,1> boolvec<float,1>::convert_int() const - { - return v; - } - - inline - boolvec<float,1> boolvec<float,1>::ifthen(boolvec_t x, boolvec_t y) const - { - return v ? x : y; - } - - inline intvec<float,1> boolvec<float,1>::ifthen(intvec_t x, intvec_t y) const - { - return v ? x : y; - } - - inline - realvec<float,1> boolvec<float,1>::ifthen(realvec_t x, realvec_t y) const - { - return v ? x : y; - } - - - - // intvec definitions - - inline realvec<float,1> intvec<float,1>::as_float() const - { - return FP::as_float(v); - } - - inline intvec<float,1> intvec<float,1>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline realvec<float,1> intvec<float,1>::convert_float() const - { - // return FP::convert_float(v); - return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v)); } - - inline intvec<float,1> intvec<float,1>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,1> intvec<float,1>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +}; + +// boolvec definitions + +inline intvec<float, 1> boolvec<float, 1>::as_int() const { return I(v); } + +inline intvec<float, 1> boolvec<float, 1>::convert_int() const { return v; } + +inline boolvec<float, 1> boolvec<float, 1>::ifthen(boolvec_t x, + boolvec_t y) const { + return v ? x : y; +} + +inline intvec<float, 1> boolvec<float, 1>::ifthen(intvec_t x, + intvec_t y) const { + return v ? x : y; +} + +inline realvec<float, 1> boolvec<float, 1>::ifthen(realvec_t x, + realvec_t y) const { + return v ? x : y; +} + +// intvec definitions + +inline realvec<float, 1> intvec<float, 1>::as_float() const { + return FP::as_float(v); +} + +inline intvec<float, 1> intvec<float, 1>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline realvec<float, 1> intvec<float, 1>::convert_float() const { + // return FP::convert_float(v); + return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v)); +} + +inline intvec<float, 1> intvec<float, 1>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 1> intvec<float, 1>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_SSE_FLOAT1_H +#endif // #ifndef VEC_SSE_FLOAT1_H diff --git a/vec_sse_float4.h b/vec_sse_float4.h index 940de67..f8e8e80 100644 --- a/vec_sse_float4.h +++ b/vec_sse_float4.h @@ -11,766 +11,642 @@ // SSE2 intrinsics #include <xmmintrin.h> -#ifdef __SSE3__ // Intel's SSE 3 -# include <pmmintrin.h> +#ifdef __SSE3__ // Intel's SSE 3 +#include <pmmintrin.h> #endif -#ifdef __SSSE3__ // Intel's SSSE 3 -# include <tmmintrin.h> +#ifdef __SSSE3__ // Intel's SSSE 3 +#include <tmmintrin.h> #endif -#if defined __SSE4_1__ // Intel's SSE 4.1 -# include <smmintrin.h> +#if defined __SSE4_1__ // Intel's SSE 4.1 +#include <smmintrin.h> #endif -#if defined __SSE4A__ // AMD's SSE 4a -# include <ammintrin.h> +#if defined __SSE4A__ // AMD's SSE 4a +#include <ammintrin.h> #endif -#if defined __AVX__ // Intel's AVX -# include <immintrin.h> +#if defined __AVX__ // Intel's AVX +#include <immintrin.h> #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_FLOAT_4 - template<> struct boolvec<float,4>; - template<> struct intvec<float,4>; - template<> struct realvec<float,4>; - - - - template<> - struct boolvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef bool scalar_t; - typedef __m128 bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - int_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec_t& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); } - - boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); } - boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); } - boolvec_t operator==(boolvec_t x) const { return !(*this!=x); } - boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; +template <> struct boolvec<float, 4>; +template <> struct intvec<float, 4>; +template <> struct realvec<float, 4>; + +template <> struct boolvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef bool scalar_t; + typedef __m128 bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) : v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {} + boolvec(bool const *as) + : v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), from_bool(as[2]), + from_bool(as[1]), from_bool(as[0])))) { + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec_t &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec_t operator!() const { return _mm_xor_ps(boolvec(true), v); } + + boolvec_t operator&&(boolvec_t x) const { return _mm_and_ps(v, x.v); } + boolvec_t operator||(boolvec_t x) const { return _mm_or_ps(v, x.v); } + boolvec_t operator==(boolvec_t x) const { return !(*this != x); } + boolvec_t operator!=(boolvec_t x) const { return _mm_xor_ps(v, x.v); } + + bool all() const { +// return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; #if defined __AVX__ - return ! (! *this).any(); + return !(!*this).any(); #else - boolvec_t x = *this; - x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); - return x[0] && x[2]; + boolvec_t x = *this; + x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)); + return x[0] && x[2]; #endif - } - bool any() const - { - // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; + } + bool any() const { +// return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; #if defined __AVX__ - return ! bool(_mm_testz_ps(v, v)); + return !bool(_mm_testz_ps(v, v)); #else - boolvec_t x = *this; - x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); - return x[0] || x[2]; + boolvec_t x = *this; + x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)); + return x[0] || x[2]; #endif + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef int_t scalar_t; + typedef __m128i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(_mm_set1_epi32(a)) {} + intvec(int_t const *as) : v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {} + static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + boolvec_t as_bool() const { return _mm_castsi128_ps(v); } + boolvec_t convert_bool() const { + // Result: convert_bool(0)=false, convert_bool(else)=true + return !IV(_mm_cmpeq_epi32(v, IV(0))).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Note: not all arithmetic operations are supported! + + intvec_t operator+() const { return *this; } + intvec_t operator-() const { return IV(0) - *this; } + + intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); } + intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); } + + intvec_t &operator+=(intvec_t const &x) { return *this = *this + x; } + intvec_t &operator-=(intvec_t const &x) { return *this = *this - x; } + + intvec_t operator~() const { return IV(~U(0)) ^ *this; } + + intvec_t operator&(intvec_t x) const { + return _mm_castps_si128( + _mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v))); + } + intvec_t operator|(intvec_t x) const { + return _mm_castps_si128( + _mm_or_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v))); + } + intvec_t operator^(intvec_t x) const { + return _mm_castps_si128( + _mm_xor_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(x.v))); + } + + intvec_t &operator&=(intvec_t const &x) { return *this = *this & x; } + intvec_t &operator|=(intvec_t const &x) { return *this = *this | x; } + intvec_t &operator^=(intvec_t const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); } + intvec_t rotate(int_t n) const; + intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); } + intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); } + intvec_t &operator>>=(int_t n) { return *this = *this >> n; } + intvec_t &operator<<=(int_t n) { return *this = *this << n; } + + intvec_t lsr(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef int_t scalar_t; - typedef __m128i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm_set1_epi32(a)) {} - intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {} - static intvec_t iota() { return _mm_set_epi32(3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - boolvec_t as_bool() const { return _mm_castsi128_ps(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec_t operator+() const { return *this; } - intvec_t operator-() const { return IV(0) - *this; } - - intvec_t operator+(intvec_t x) const { return _mm_add_epi32(v, x.v); } - intvec_t operator-(intvec_t x) const { return _mm_sub_epi32(v, x.v); } - - intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } - intvec_t& operator-=(intvec_t const& x) { return *this=*this-x; } - - - - intvec_t operator~() const { return IV(~U(0)) ^ *this; } - - intvec_t operator&(intvec_t x) const - { - return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - intvec_t operator|(intvec_t x) const - { - return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - intvec_t operator^(intvec_t x) const - { - return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - - intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } - intvec_t& operator|=(intvec_t const& x) { return *this=*this|x; } - intvec_t& operator^=(intvec_t const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec_t lsr(int_t n) const { return _mm_srli_epi32(v, n); } - intvec_t rotate(int_t n) const; - intvec_t operator>>(int_t n) const { return _mm_srai_epi32(v, n); } - intvec_t operator<<(int_t n) const { return _mm_slli_epi32(v, n); } - intvec_t& operator>>=(int_t n) { return *this=*this>>n; } - intvec_t& operator<<=(int_t n) { return *this=*this<<n; } - - intvec_t lsr(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; - } - intvec_t rotate(intvec_t n) const; - intvec_t operator>>(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; - } - intvec_t operator<<(intvec_t n) const - { - intvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; - } - intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } - intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec_t const& x) const - { - return ! (*this != x); - } - boolvec_t operator!=(intvec_t const& x) const - { - return (*this ^ x).convert_bool(); - } - boolvec_t operator<(intvec_t const& x) const - { - // return (*this - x).as_bool(); - boolvec_t r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] < x[i]); - } - return r; - } - boolvec_t operator<=(intvec_t const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec_t const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec_t const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<float,4>: floatprops<float> - { - static int const size = 4; - typedef real_t scalar_t; - typedef __m128 vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<SSE2:4*float>"; } - void barrier() { __asm__("": "+x"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm_set1_ps(a)) {} - realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm_load_ps(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm_loadu_ps(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - if (ioff==0) return loada(p); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } + return r; + } + intvec_t rotate(intvec_t n) const; + intvec_t operator>>(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); + return r; + } + intvec_t operator<<(intvec_t n) const { + intvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm_store_ps(p, v); + return r; + } + intvec_t &operator>>=(intvec_t n) { return *this = *this >> n; } + intvec_t &operator<<=(intvec_t n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec_t const &x) const { return !(*this != x); } + boolvec_t operator!=(intvec_t const &x) const { + return (*this ^ x).convert_bool(); + } + boolvec_t operator<(intvec_t const &x) const { + // return (*this - x).as_bool(); + boolvec_t r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); } - void storeu(real_t* p) const - { - return _mm_storeu_ps(p, v); + return r; + } + boolvec_t operator<=(intvec_t const &x) const { return !(*this > x); } + boolvec_t operator>(intvec_t const &x) const { return x < *this; } + boolvec_t operator>=(intvec_t const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<float, 4> : floatprops<float> { + static int const size = 4; + typedef real_t scalar_t; + typedef __m128 vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<SSE2:4*float>"; } + void barrier() { __asm__("" : "+x"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(_mm_set1_ps(a)) {} + realvec(real_t const *as) : v(_mm_set_ps(as[3], as[2], as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm_load_ps(p); + } + static realvec_t loadu(real_t const *p) { return _mm_loadu_ps(p); } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + if (ioff == 0) + return loada(p); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm_store_ps(p, v); + } + void storeu(real_t *p) const { return _mm_storeu_ps(p, v); } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { #if defined __AVX__ - _mm_maskstore_ps(p, m.m.as_int(), v); + _mm_maskstore_ps(p, m.m.as_int(), v); #else - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; #endif - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return _mm_castps_si128(v); } - intvec_t convert_int() const { return _mm_cvttps_epi32(v); } - - - - realvec_t operator+() const { return *this; } - realvec_t operator-() const { return RV(0.0) - *this; } - - realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); } - realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); } - realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); } - realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); } - - realvec_t& operator+=(realvec_t const& x) { return *this=*this+x; } - realvec_t& operator-=(realvec_t const& x) { return *this=*this-x; } - realvec_t& operator*=(realvec_t const& x) { return *this=*this*x; } - realvec_t& operator/=(realvec_t const& x) { return *this=*this/x; } - - real_t maxval() const - { - // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), - // vml_std::fmax((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); - realvec_t y0022 = x0123.fmax(x1032); - return vml_std::fmax(y0022[0], y0022[2]); - } - real_t minval() const - { - // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), - // vml_std::fmin((*this)[2], (*this)[3])); - realvec_t x0123 = *this; - realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); - realvec_t y0022 = x0123.fmin(x1032); - return vml_std::fmin(y0022[0], y0022[2]); } - real_t prod() const - { - // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - realvec_t x0123 = *this; - realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); - realvec_t y0022 = x0123 * x1032; - return y0022[0] * y0022[2]; + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; + if (m.m[2]) + p[2] = (*this)[2]; + if (m.m[3]) + p[3] = (*this)[3]; } - real_t sum() const - { + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return _mm_castps_si128(v); } + intvec_t convert_int() const { return _mm_cvttps_epi32(v); } + + realvec_t operator+() const { return *this; } + realvec_t operator-() const { return RV(0.0) - *this; } + + realvec_t operator+(realvec_t x) const { return _mm_add_ps(v, x.v); } + realvec_t operator-(realvec_t x) const { return _mm_sub_ps(v, x.v); } + realvec_t operator*(realvec_t x) const { return _mm_mul_ps(v, x.v); } + realvec_t operator/(realvec_t x) const { return _mm_div_ps(v, x.v); } + + realvec_t &operator+=(realvec_t const &x) { return *this = *this + x; } + realvec_t &operator-=(realvec_t const &x) { return *this = *this - x; } + realvec_t &operator*=(realvec_t const &x) { return *this = *this * x; } + realvec_t &operator/=(realvec_t const &x) { return *this = *this / x; } + + real_t maxval() const { + // return vml_std::fmax(vml_std::fmax((*this)[0], (*this)[1]), + // vml_std::fmax((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); + realvec_t y0022 = x0123.fmax(x1032); + return vml_std::fmax(y0022[0], y0022[2]); + } + real_t minval() const { + // return vml_std::fmin(vml_std::fmin((*this)[0], (*this)[1]), + // vml_std::fmin((*this)[2], (*this)[3])); + realvec_t x0123 = *this; + realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); + realvec_t y0022 = x0123.fmin(x1032); + return vml_std::fmin(y0022[0], y0022[2]); + } + real_t prod() const { + // return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + realvec_t x0123 = *this; + realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); + realvec_t y0022 = x0123 * x1032; + return y0022[0] * y0022[2]; + } + real_t sum() const { #ifdef __SSE3__ - realvec_t x = *this; - x = _mm_hadd_ps(x.v, x.v); - x = _mm_hadd_ps(x.v, x.v); - return x[0]; + realvec_t x = *this; + x = _mm_hadd_ps(x.v, x.v); + x = _mm_hadd_ps(x.v, x.v); + return x[0]; #else - // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - realvec_t x0123 = *this; - realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); - realvec_t y0022 = x0123 + x1032; - return y0022[0] + y0022[2]; + // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + realvec_t x0123 = *this; + realvec_t x1032 = _mm_shuffle_ps(x0123, x0123, 0b10110001); + realvec_t y0022 = x0123 + x1032; + return y0022[0] + y0022[2]; #endif - } - - - - boolvec_t operator==(realvec_t const& x) const - { - return _mm_cmpeq_ps(v, x.v); - } - boolvec_t operator!=(realvec_t const& x) const - { - return _mm_cmpneq_ps(v, x.v); - } - boolvec_t operator<(realvec_t const& x) const - { - return _mm_cmplt_ps(v, x.v); - } - boolvec_t operator<=(realvec_t const& x) const - { - return _mm_cmple_ps(v, x.v); - } - boolvec_t operator>(realvec_t const& x) const - { - return _mm_cmpgt_ps(v, x.v); - } - boolvec_t operator>=(realvec_t const& x) const - { - return _mm_cmpge_ps(v, x.v); - } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const - { + } + + boolvec_t operator==(realvec_t const &x) const { + return _mm_cmpeq_ps(v, x.v); + } + boolvec_t operator!=(realvec_t const &x) const { + return _mm_cmpneq_ps(v, x.v); + } + boolvec_t operator<(realvec_t const &x) const { return _mm_cmplt_ps(v, x.v); } + boolvec_t operator<=(realvec_t const &x) const { + return _mm_cmple_ps(v, x.v); + } + boolvec_t operator>(realvec_t const &x) const { return _mm_cmpgt_ps(v, x.v); } + boolvec_t operator>=(realvec_t const &x) const { + return _mm_cmpge_ps(v, x.v); + } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { #ifdef __SSE4_1__ - return _mm_ceil_ps(v); + return _mm_ceil_ps(v); #else - return MF::vml_ceil(*this); + return MF::vml_ceil(*this); #endif - } - realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return MF::vml_fabs(*this); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const - { + } + realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return MF::vml_fabs(*this); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { #ifdef __SSE4_1__ - return _mm_floor_ps(v); + return _mm_floor_ps(v); #else - return MF::vml_floor(*this); + return MF::vml_floor(*this); #endif - } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); - } - realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); } - realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); } - realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } - realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const - { + } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); } + realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); } + realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } + realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { #if defined VML_HAVE_NAN - return _mm_cmpunord_ps(v, v); + return _mm_cmpunord_ps(v, v); #else - return BV(false); + return BV(false); #endif - } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const - { - realvec_t x = *this; - realvec_t r = _mm_rcp_ps(x); // this is only an approximation - r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) - return r; - } - realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } - realvec_t rint() const - { + } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { + realvec_t x = *this; + realvec_t r = _mm_rcp_ps(x); // this is only an approximation + r *= RV(2.0) - r * x; // one Newton iteration (see vml_rcp) + return r; + } + realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } + realvec_t rint() const { #ifdef __SSE4_1__ - return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT); + return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT); #else - return MF::vml_rint(*this); + return MF::vml_rint(*this); #endif - } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const - { - realvec_t x = *this; - realvec_t r = _mm_rsqrt_ps(x); // this is only an approximation - r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) - return r; - } - boolvec_t signbit() const { return v; } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return _mm_sqrt_ps(v); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const - { + } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { + realvec_t x = *this; + realvec_t r = _mm_rsqrt_ps(x); // this is only an approximation + r *= RV(1.5) - RV(0.5) * x * r * r; // one Newton iteration (see vml_rsqrt) + return r; + } + boolvec_t signbit() const { return v; } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return _mm_sqrt_ps(v); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { #ifdef __SSE4_1__ - return _mm_round_ps(v, _MM_FROUND_TO_ZERO); + return _mm_round_ps(v, _MM_FROUND_TO_ZERO); #else - return MF::vml_trunc(*this); + return MF::vml_trunc(*this); #endif - } - }; - - - - // boolvec definitions - - inline intvec<float,4> boolvec<float,4>::as_int() const - { - return _mm_castps_si128(v); - } - - inline intvec<float,4> boolvec<float,4>::convert_int() const - { - return lsr(as_int(), bits-1); - } - - inline - boolvec<float,4> boolvec<float,4>::ifthen(boolvec_t x, boolvec_t y) const - { - return ifthen(x.as_int(), y.as_int()).as_bool(); - } - - inline intvec<float,4> boolvec<float,4>::ifthen(intvec_t x, intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - realvec<float,4> boolvec<float,4>::ifthen(realvec_t x, realvec_t y) const - { + } +}; + +// boolvec definitions + +inline intvec<float, 4> boolvec<float, 4>::as_int() const { + return _mm_castps_si128(v); +} + +inline intvec<float, 4> boolvec<float, 4>::convert_int() const { + return lsr(as_int(), bits - 1); +} + +inline boolvec<float, 4> boolvec<float, 4>::ifthen(boolvec_t x, + boolvec_t y) const { + return ifthen(x.as_int(), y.as_int()).as_bool(); +} + +inline intvec<float, 4> boolvec<float, 4>::ifthen(intvec_t x, + intvec_t y) const { + return ifthen(x.as_float(), y.as_float()).as_int(); +} + +inline realvec<float, 4> boolvec<float, 4>::ifthen(realvec_t x, + realvec_t y) const { #ifdef __SSE4_1__ - return _mm_blendv_ps(y.v, x.v, v); + return _mm_blendv_ps(y.v, x.v, v); #else - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); + return ((-convert_int() & x.as_int()) | (~ - convert_int() & y.as_int())) + .as_float(); #endif - } +} + +// intvec definitions - - - // intvec definitions - - inline intvec<float,4> intvec<float,4>::abs() const - { +inline intvec<float, 4> intvec<float, 4>::abs() const { #ifdef __SSSE3__ - return _mm_abs_epi32(v); + return _mm_abs_epi32(v); #else - return MF::vml_abs(*this); + return MF::vml_abs(*this); #endif - } - - inline realvec<float,4> intvec<float,4>::as_float() const - { - return _mm_castsi128_ps(v); - } - - inline intvec<float,4> intvec<float,4>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<float,4> intvec<float,4>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<float,4> intvec<float,4>::convert_float() const - { - return _mm_cvtepi32_ps(v); - } - - inline intvec<float,4> intvec<float,4>::max(intvec_t x) const - { +} + +inline realvec<float, 4> intvec<float, 4>::as_float() const { + return _mm_castsi128_ps(v); +} + +inline intvec<float, 4> intvec<float, 4>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<float, 4> intvec<float, 4>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<float, 4> intvec<float, 4>::convert_float() const { + return _mm_cvtepi32_ps(v); +} + +inline intvec<float, 4> intvec<float, 4>::max(intvec_t x) const { #ifdef __SSE4_1__ - return _mm_max_epi32(v, x.v); + return _mm_max_epi32(v, x.v); #else - return MF::vml_max(*this, x); + return MF::vml_max(*this, x); #endif - } - - inline intvec<float,4> intvec<float,4>::min(intvec_t x) const - { +} + +inline intvec<float, 4> intvec<float, 4>::min(intvec_t x) const { #ifdef __SSE4_1__ - return _mm_min_epi32(v, x.v); + return _mm_min_epi32(v, x.v); #else - return MF::vml_min(*this, x); + return MF::vml_min(*this, x); #endif - } - - inline intvec<float,4> intvec<float,4>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<float,4> intvec<float,4>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<float,4> intvec<float,4>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - +} + +inline intvec<float, 4> intvec<float, 4>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<float, 4> intvec<float, 4>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_SSE_FLOAT4_H +#endif // #ifndef VEC_SSE_FLOAT4_H @@ -9,1474 +9,1280 @@ #include <cmath> #ifndef VML_NO_IOSTREAM -# include <sstream> +#include <sstream> #endif +namespace vecmathlib { +template <typename T, int N> struct booltestvec; +template <typename T, int N> struct inttestvec; +template <typename T, int N> struct realtestvec; + +template <typename T, int N> struct booltestvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef bool scalar_t; + typedef bool bvector_t[size]; + static int const alignment = sizeof(bool); + + typedef booltestvec boolvec_t; + typedef inttestvec<real_t, size> intvec_t; + typedef realtestvec<real_t, size> realvec_t; + + // short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + booltestvec() {} + // can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // booltestvec(booltestvec const& x): v(x.v) {} + // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; } + // booltestvec(vector_t x): v(x) {} + booltestvec(bool a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + booltestvec(bool const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + + bool operator[](int n) const { return v[n]; } + boolvec_t &set_elt(int n, bool a) { return v[n] = a, *this; } + + intvec_t as_int() const; // defined after inttestvec + intvec_t convert_int() const; // defined after inttestvec + + boolvec_t operator!() const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = !v[d]; + return res; + } + + boolvec_t operator&&(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] && x.v[d]; + return res; + } + boolvec_t operator||(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] || x.v[d]; + return res; + } + boolvec_t operator==(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(boolvec_t x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + + bool all() const { + bool res = v[0]; + for (int d = 1; d < size; ++d) + res = res && v[d]; + return res; + } + bool any() const { + bool res = v[0]; + for (int d = 1; d < size; ++d) + res = res || v[d]; + return res; + } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after inttestvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec +}; + +template <typename T, int N> struct inttestvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef int_t scalar_t; + typedef int_t ivector_t[size]; + static int const alignment = sizeof(int_t); + + typedef booltestvec<real_t, size> boolvec_t; + typedef inttestvec intvec_t; + typedef realtestvec<real_t, size> realvec_t; + + // short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + inttestvec() {} + // can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // inttestvec(inttestvec const& x): v(x.v) {} + // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; } + // inttestvec(vector_t x): v(x) {} + inttestvec(int_t a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + inttestvec(int_t const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + static intvec_t iota() { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = d; + return res; + } + + int_t operator[](int n) const { return v[n]; } + intvec_t &set_elt(int n, int_t a) { return v[n] = a, *this; } + + boolvec_t as_bool() const { return convert_bool(); } + boolvec_t convert_bool() const { + // result: convert_bool(0)=false, convert_bool(else)=true + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d]; + return res; + } + realvec_t as_float() const; // defined after realtestvec + realvec_t convert_float() const; // defined after realtestvec + + intvec_t operator+() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = +v[d]; + return res; + } + intvec_t operator-() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = -v[d]; + return res; + } + + intvec_t &operator+=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] += x.v[d]; + return *this; + } + intvec_t &operator-=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] -= x.v[d]; + return *this; + } + intvec_t &operator*=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] *= x.v[d]; + return *this; + } + intvec_t &operator/=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] /= x.v[d]; + return *this; + } + intvec_t &operator%=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] %= x.v[d]; + return *this; + } + + intvec_t operator+(intvec_t x) const { + intvec_t res = *this; + return res += x; + } + intvec_t operator-(intvec_t x) const { + intvec_t res = *this; + return res -= x; + } + intvec_t operator*(intvec_t x) const { + intvec_t res = *this; + return res *= x; + } + intvec_t operator/(intvec_t x) const { + intvec_t res = *this; + return res /= x; + } + intvec_t operator%(intvec_t x) const { + intvec_t res = *this; + return res %= x; + } + + intvec_t operator~() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = ~v[d]; + return res; + } + + intvec_t &operator&=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] &= x.v[d]; + return *this; + } + intvec_t &operator|=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] |= x.v[d]; + return *this; + } + intvec_t &operator^=(intvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] ^= x.v[d]; + return *this; + } + + intvec_t operator&(intvec_t x) const { + intvec_t res = *this; + return res &= x; + } + intvec_t operator|(intvec_t x) const { + intvec_t res = *this; + return res |= x; + } + intvec_t operator^(intvec_t x) const { + intvec_t res = *this; + return res ^= x; + } + + intvec_t bitifthen(intvec_t x, intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); + } + + intvec_t lsr(int_t n) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = I(U(v[d]) >> U(n)); + return res; + } + intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); } + intvec_t &operator>>=(int_t n) { + for (int d = 0; d < size; ++d) + v[d] >>= n; + return *this; + } + intvec_t &operator<<=(int_t n) { + for (int d = 0; d < size; ++d) + v[d] <<= n; + return *this; + } + intvec_t operator>>(int_t n) const { + intvec_t res = *this; + return res >>= n; + } + intvec_t operator<<(int_t n) const { + intvec_t res = *this; + return res <<= n; + } + + intvec_t lsr(intvec_t n) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = I(U(v[d]) >> U(n.v[d])); + return res; + } + intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); } + intvec_t &operator>>=(intvec_t n) { + for (int d = 0; d < size; ++d) + v[d] >>= n.v[d]; + return *this; + } + intvec_t &operator<<=(intvec_t n) { + for (int d = 0; d < size; ++d) + v[d] <<= n.v[d]; + return *this; + } + intvec_t operator>>(intvec_t n) const { + intvec_t res = *this; + return res >>= n; + } + intvec_t operator<<(intvec_t n) const { + intvec_t res = *this; + return res <<= n; + } + + intvec_t clz() const { return MF::vml_clz(*this); } + intvec_t popcount() const { return MF::vml_popcount(*this); } + + boolvec_t operator==(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + boolvec_t operator<(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] < x.v[d]; + return res; + } + boolvec_t operator<=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] <= x.v[d]; + return res; + } + boolvec_t operator>(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] > x.v[d]; + return res; + } + boolvec_t operator>=(intvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] >= x.v[d]; + return res; + } + + intvec_t abs() const { return MF::vml_abs(*this); } + boolvec_t isignbit() const { return MF::vml_isignbit(*this); } + intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } + intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } +}; + +template <typename T, int N> struct realtestvec : floatprops<T> { + typedef typename floatprops<T>::int_t int_t; + typedef typename floatprops<T>::uint_t uint_t; + typedef typename floatprops<T>::real_t real_t; + + static int const size = N; + typedef real_t scalar_t; + typedef real_t vector_t[size]; + static int const alignment = sizeof(real_t); -namespace vecmathlib { - - template<typename T, int N> struct booltestvec; - template<typename T, int N> struct inttestvec; - template<typename T, int N> struct realtestvec; - - - - template<typename T, int N> - struct booltestvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef bool scalar_t; - typedef bool bvector_t[size]; - static int const alignment = sizeof(bool); - - typedef booltestvec boolvec_t; - typedef inttestvec<real_t, size> intvec_t; - typedef realtestvec<real_t, size> realvec_t; - - // short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - booltestvec() {} - // can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // booltestvec(booltestvec const& x): v(x.v) {} - // booltestvec& operator=(booltestvec const& x) { return v=x.v, *this; } - //booltestvec(vector_t x): v(x) {} - booltestvec(bool a) { for (int d=0; d<size; ++d) v[d]=a; } - booltestvec(bool const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - - bool operator[](int n) const { return v[n]; } - boolvec_t& set_elt(int n, bool a) { return v[n]=a, *this; } - - - - intvec_t as_int() const; // defined after inttestvec - intvec_t convert_int() const; // defined after inttestvec - - - - boolvec_t operator!() const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = !v[d]; - return res; - } - - boolvec_t operator&&(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] && x.v[d]; - return res; - } - boolvec_t operator||(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] || x.v[d]; - return res; - } - boolvec_t operator==(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(boolvec_t x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - - bool all() const - { - bool res = v[0]; - for (int d=1; d<size; ++d) res = res && v[d]; - return res; - } - bool any() const - { - bool res = v[0]; - for (int d=1; d<size; ++d) res = res || v[d]; - return res; - } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after inttestvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realtestvec - }; - - - - template<typename T, int N> - struct inttestvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef int_t scalar_t; - typedef int_t ivector_t[size]; - static int const alignment = sizeof(int_t); - - typedef booltestvec<real_t, size> boolvec_t; - typedef inttestvec intvec_t; - typedef realtestvec<real_t, size> realvec_t; - - // short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - inttestvec() {} - // can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // inttestvec(inttestvec const& x): v(x.v) {} - // inttestvec& operator=(inttestvec const& x) { return v=x.v, *this; } - //inttestvec(vector_t x): v(x) {} - inttestvec(int_t a) { for (int d=0; d<size; ++d) v[d]=a; } - inttestvec(int_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - static intvec_t iota() - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d]=d; - return res; - } - - int_t operator[](int n) const { return v[n]; } - intvec_t& set_elt(int n, int_t a) { return v[n]=a, *this; } - - - - boolvec_t as_bool() const { return convert_bool(); } - boolvec_t convert_bool() const - { - // result: convert_bool(0)=false, convert_bool(else)=true - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d]=v[d]; - return res; - } - realvec_t as_float() const; // defined after realtestvec - realvec_t convert_float() const; // defined after realtestvec - - - - intvec_t operator+() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = + v[d]; - return res; - } - intvec_t operator-() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = - v[d]; - return res; - } - - intvec_t& operator+=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] += x.v[d]; - return *this; - } - intvec_t& operator-=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] -= x.v[d]; - return *this; - } - intvec_t& operator*=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] *= x.v[d]; - return *this; - } - intvec_t& operator/=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] /= x.v[d]; - return *this; - } - intvec_t& operator%=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] %= x.v[d]; - return *this; - } - - intvec_t operator+(intvec_t x) const - { - intvec_t res = *this; - return res += x; - } - intvec_t operator-(intvec_t x) const - { - intvec_t res = *this; - return res -= x; - } - intvec_t operator*(intvec_t x) const - { - intvec_t res = *this; - return res *= x; - } - intvec_t operator/(intvec_t x) const - { - intvec_t res = *this; - return res /= x; - } - intvec_t operator%(intvec_t x) const - { - intvec_t res = *this; - return res %= x; - } - - - - intvec_t operator~() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = ~ v[d]; - return res; - } - - intvec_t& operator&=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] &= x.v[d]; - return *this; - } - intvec_t& operator|=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] |= x.v[d]; - return *this; - } - intvec_t& operator^=(intvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] ^= x.v[d]; - return *this; - } - - intvec_t operator&(intvec_t x) const - { - intvec_t res = *this; - return res &= x; - } - intvec_t operator|(intvec_t x) const - { - intvec_t res = *this; - return res |= x; - } - intvec_t operator^(intvec_t x) const - { - intvec_t res = *this; - return res ^= x; - } - - intvec_t bitifthen(intvec_t x, intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - - - intvec_t lsr(int_t n) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n)); - return res; - } - intvec_t rotate(int_t n) const { return MF::vml_rotate(*this, n); } - intvec_t& operator>>=(int_t n) - { - for (int d=0; d<size; ++d) v[d] >>= n; - return *this; - } - intvec_t& operator<<=(int_t n) - { - for (int d=0; d<size; ++d) v[d] <<= n; - return *this; - } - intvec_t operator>>(int_t n) const - { - intvec_t res = *this; - return res >>= n; - } - intvec_t operator<<(int_t n) const - { - intvec_t res = *this; - return res <<= n; - } - - intvec_t lsr(intvec_t n) const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = I(U(v[d]) >> U(n.v[d])); - return res; - } - intvec_t rotate(intvec_t n) const { return MF::vml_rotate(*this, n); } - intvec_t& operator>>=(intvec_t n) - { - for (int d=0; d<size; ++d) v[d] >>= n.v[d]; - return *this; - } - intvec_t& operator<<=(intvec_t n) - { - for (int d=0; d<size; ++d) v[d] <<= n.v[d]; - return *this; - } - intvec_t operator>>(intvec_t n) const - { - intvec_t res = *this; - return res >>= n; - } - intvec_t operator<<(intvec_t n) const - { - intvec_t res = *this; - return res <<= n; - } - - intvec_t clz() const { return MF::vml_clz(*this); } - intvec_t popcount() const { return MF::vml_popcount(*this); } - - - - boolvec_t operator==(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - boolvec_t operator<(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d]; - return res; - } - boolvec_t operator<=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d]; - return res; - } - boolvec_t operator>(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d]; - return res; - } - boolvec_t operator>=(intvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d]; - return res; - } - - intvec_t abs() const { return MF::vml_abs(*this); } - boolvec_t isignbit() const { return MF::vml_isignbit(*this); } - intvec_t max(intvec_t x) const { return MF::vml_max(*this, x); } - intvec_t min(intvec_t x) const { return MF::vml_min(*this, x); } - }; - - - - template<typename T, int N> - struct realtestvec: floatprops<T> - { - typedef typename floatprops<T>::int_t int_t; - typedef typename floatprops<T>::uint_t uint_t; - typedef typename floatprops<T>::real_t real_t; - - static int const size = N; - typedef real_t scalar_t; - typedef real_t vector_t[size]; - static int const alignment = sizeof(real_t); - #ifndef VML_NO_IOSTREAM - static char const* name() - { - static std::string name_; - if (name_.empty()) { - std::stringstream buf; - buf << "<VML:" << N << "*" << FP::name() << ">"; - name_ = buf.str(); - } - return name_.c_str(); + static char const *name() { + static std::string name_; + if (name_.empty()) { + std::stringstream buf; + buf << "<VML:" << N << "*" << FP::name() << ">"; + name_ = buf.str(); } + return name_.c_str(); + } #endif - void barrier() - { + void barrier() { #if defined __GNUC__ && !defined __clang__ && !defined __ICC - // GCC crashes when +X is used as constraint -# if defined __SSE2__ - for (int d=0; d<size; ++d) __asm__("": "+x"(v[d])); -# elif defined __PPC64__ // maybe also __PPC__ - for (int d=0; d<size; ++d) __asm__("": "+f"(v[d])); -# elif defined __arm__ - for (int d=0; d<size; ++d) __asm__("": "+w"(v[d])); -# else -# error "Floating point barrier undefined on this architecture" -# endif +// GCC crashes when +X is used as constraint +#if defined __SSE2__ + for (int d = 0; d < size; ++d) + __asm__("" : "+x"(v[d])); +#elif defined __PPC64__ // maybe also __PPC__ + for (int d = 0; d < size; ++d) + __asm__("" : "+f"(v[d])); +#elif defined __arm__ + for (int d = 0; d < size; ++d) + __asm__("" : "+w"(v[d])); +#else +#error "Floating point barrier undefined on this architecture" +#endif #elif defined __clang__ - for (int d=0; d<size; ++d) __asm__("": "+X"(v[d])); + for (int d = 0; d < size; ++d) + __asm__("" : "+X"(v[d])); #elif defined __ICC - for (int d=0; d<size; ++d) { - real_t tmp = v[d]; - __asm__("": "+X"(tmp)); - v[d] = tmp; - } + for (int d = 0; d < size; ++d) { + real_t tmp = v[d]; + __asm__("" : "+X"(tmp)); + v[d] = tmp; + } #elif defined __IBMCPP__ - for (int d=0; d<size; ++d) __asm__("": "+f"(v[d])); + for (int d = 0; d < size; ++d) + __asm__("" : "+f"(v[d])); #else -# error "Floating point barrier undefined on this architecture" +#error "Floating point barrier undefined on this architecture" #endif - } - - typedef booltestvec<real_t, size> boolvec_t; - typedef inttestvec<real_t, size> intvec_t; - typedef realtestvec realvec_t; - - // short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realtestvec() {} - // can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realtestvec(realtestvec const& x): v(x.v) {} - // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; } - //realtestvec(vector_t x): v(x) {} - realtestvec(real_t a) { for (int d=0; d<size; ++d) v[d]=a; } - realtestvec(real_t const* as) { for (int d=0; d<size; ++d) v[d]=as[d]; } - - real_t operator[](int n) const { return v[n]; } - realvec_t& set_elt(int n, real_t a) { return v[n]=a, *this; } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loadu(p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = p[d]; - return res; - } - static realvec_t loadu(real_t const* p, size_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - return m.m.ifthen(loada(p), *this); - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - return m.m.ifthen(loadu(p), *this); - } - realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const - { - return m.m.ifthen(loadu(p, ioff), *this); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p); - } - void storeu(real_t* p) const - { - for (int d=0; d<size; ++d) p[d] = v[d]; - } - void storeu(real_t* p, size_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p, m); - } - void storeu(real_t* p, mask_t const& m) const - { - for (int d=0; d<size; ++d) if (m.m[d]) p[d] = v[d]; - } - void storeu(real_t* p, size_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::as_int(v[d]); - return res; - } - intvec_t convert_int() const { return MF::vml_convert_int(*this); } - - - - realvec_t operator+() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = + v[d]; - return res; - } - realvec_t operator-() const - { - realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = - v[d]; - return res; - } - - realvec_t& operator+=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] += x.v[d]; - return *this; - } - realvec_t& operator-=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] -= x.v[d]; - return *this; - } - realvec_t& operator*=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] *= x.v[d]; - return *this; - } - realvec_t& operator/=(realvec_t const& x) - { - for (int d=0; d<size; ++d) v[d] /= x.v[d]; - return *this; - } - - realvec_t operator+(realvec_t x) const - { - realvec_t res = *this; - return res += x; - } - realvec_t operator-(realvec_t x) const - { - realvec_t res = *this; - return res -= x; - } - realvec_t operator*(realvec_t x) const - { - realvec_t res = *this; - return res *= x; - } - realvec_t operator/(realvec_t x) const - { - realvec_t res = *this; - return res /= x; - } - - real_t maxval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res = vml_std::fmax(res, v[d]); - return res; - } - real_t minval() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res = vml_std::fmin(res, v[d]); - return res; - } - real_t prod() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res *= v[d]; - return res; - } - real_t sum() const - { - real_t res = v[0]; - for (int d=1; d<size; ++d) res += v[d]; - return res; - } - - - - boolvec_t operator==(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] == x.v[d]; - return res; - } - boolvec_t operator!=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] != x.v[d]; - return res; - } - boolvec_t operator<(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] < x.v[d]; - return res; - } - boolvec_t operator<=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] <= x.v[d]; - return res; - } - boolvec_t operator>(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] > x.v[d]; - return res; - } - boolvec_t operator>=(realvec_t const& x) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] >= x.v[d]; - return res; - } - - - - realvec_t acos() const { return MF::vml_acos(*this); } - realvec_t acosh() const { return MF::vml_acosh(*this); } - realvec_t asin() const { return MF::vml_asin(*this); } - realvec_t asinh() const { return MF::vml_asinh(*this); } - realvec_t atan() const { return MF::vml_atan(*this); } - realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } - realvec_t atanh() const { return MF::vml_atanh(*this); } - realvec_t cbrt() const { return MF::vml_cbrt(*this); } - realvec_t ceil() const { return MF::vml_ceil(*this); } - realvec_t copysign(realvec_t y) const - { - return MF::vml_copysign(*this, y); - } - realvec_t cos() const { return MF::vml_cos(*this); } - realvec_t cosh() const { return MF::vml_cosh(*this); } - realvec_t exp() const { return MF::vml_exp(*this); } - realvec_t exp10() const { return MF::vml_exp10(*this); } - realvec_t exp2() const { return MF::vml_exp2(*this); } - realvec_t expm1() const { return MF::vml_expm1(*this); } - realvec_t fabs() const { return MF::vml_fabs(*this); } - realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } - realvec_t floor() const { return MF::vml_floor(*this); } - realvec_t fma(realvec_t y, realvec_t z) const - { - return MF::vml_fma(*this, y, z); - } - realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); } - realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); } - realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } - realvec_t frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec_t log() const { return MF::vml_log(*this); } - realvec_t log10() const { return MF::vml_log10(*this); } - realvec_t log1p() const { return MF::vml_log1p(*this); } - realvec_t log2() const { return MF::vml_log2(*this); } - intvec_t lrint() const { return MF::vml_lrint(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); - } - realvec_t nextafter(realvec_t y) const - { - return MF::vml_nextafter(*this, y); - } - realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } - realvec_t rcp() const { return MF::vml_rcp(*this); } - realvec_t remainder(realvec_t y) const - { - return MF::vml_remainder(*this, y); - } - realvec_t rint() const { return MF::vml_rint(*this); } - realvec_t round() const { return MF::vml_round(*this); } - realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec_t sin() const { return MF::vml_sin(*this); } - realvec_t sinh() const { return MF::vml_sinh(*this); } - realvec_t sqrt() const { return MF::vml_sqrt(*this); } - realvec_t tan() const { return MF::vml_tan(*this); } - realvec_t tanh() const { return MF::vml_tanh(*this); } - realvec_t trunc() const { return MF::vml_trunc(*this); } - }; - - - - // booltestvec definitions - - template<typename T, int N> - inline - typename booltestvec<T,N>::intvec_t - booltestvec<T,N>::as_int() const - { - return convert_int(); - } - - template<typename T, int N> - inline - typename booltestvec<T,N>::intvec_t - booltestvec<T,N>::convert_int() const - { - intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d]; - return res; } - - template<typename T, int N> - inline - typename booltestvec<T,N>::boolvec_t - booltestvec<T,N>::ifthen(boolvec_t x, boolvec_t y) const - { - boolvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; + + typedef booltestvec<real_t, size> boolvec_t; + typedef inttestvec<real_t, size> intvec_t; + typedef realtestvec realvec_t; + + // short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realtestvec() {} + // can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realtestvec(realtestvec const& x): v(x.v) {} + // realtestvec& operator=(realtestvec const& x) { return v=x.v, *this; } + // realtestvec(vector_t x): v(x) {} + realtestvec(real_t a) { + for (int d = 0; d < size; ++d) + v[d] = a; + } + realtestvec(real_t const *as) { + for (int d = 0; d < size; ++d) + v[d] = as[d]; + } + + real_t operator[](int n) const { return v[n]; } + realvec_t &set_elt(int n, real_t a) { return v[n] = a, *this; } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loadu(p); + } + static realvec_t loadu(real_t const *p) { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = p[d]; return res; } - - template<typename T, int N> - inline - typename booltestvec<T,N>::intvec_t - booltestvec<T,N>::ifthen(intvec_t x, intvec_t y) const - { + static realvec_t loadu(real_t const *p, size_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + return m.m.ifthen(loada(p), *this); + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + return m.m.ifthen(loadu(p), *this); + } + realvec_t loadu(real_t const *p, size_t ioff, mask_t const &m) const { + return m.m.ifthen(loadu(p, ioff), *this); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p); + } + void storeu(real_t *p) const { + for (int d = 0; d < size; ++d) + p[d] = v[d]; + } + void storeu(real_t *p, size_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p, m); + } + void storeu(real_t *p, mask_t const &m) const { + for (int d = 0; d < size; ++d) + if (m.m[d]) + p[d] = v[d]; + } + void storeu(real_t *p, size_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + storeu(p + ioff, m); + } + + intvec_t as_int() const { intvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; + for (int d = 0; d < size; ++d) + res.v[d] = FP::as_int(v[d]); return res; } - - template<typename T, int N> - inline - typename booltestvec<T,N>::realvec_t - booltestvec<T,N>::ifthen(realvec_t x, realvec_t y) const - { + intvec_t convert_int() const { return MF::vml_convert_int(*this); } + + realvec_t operator+() const { realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = v[d] ? x.v[d] : y.v[d]; + for (int d = 0; d < size; ++d) + res.v[d] = +v[d]; return res; } - - - - // inttestvec definitions - - template<typename T, int N> - inline - typename inttestvec<T,N>::realvec_t - inttestvec<T,N>::as_float() const - { + realvec_t operator-() const { realvec_t res; - for (int d=0; d<size; ++d) res.v[d] = FP::as_float(v[d]); + for (int d = 0; d < size; ++d) + res.v[d] = -v[d]; return res; } - - template<typename T, int N> - inline - typename inttestvec<T,N>::realvec_t - inttestvec<T,N>::convert_float() const - { - return MF::vml_convert_float(*this); - } - - - - // Wrappers - - // booltestvec wrappers - - template<typename real_t, int size> - inline - inttestvec<real_t, size> as_int(booltestvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline bool all(booltestvec<real_t, size> x) { return x.all(); } - - template<typename real_t, int size> - inline bool any(booltestvec<real_t, size> x) { return x.any(); } - - template<typename real_t, int size> - inline - booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c, - booltestvec<real_t, size> x, - booltestvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c, - inttestvec<real_t, size> x, - inttestvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - template<typename real_t, int size> - inline - realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c, - realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return c.ifthen(x, y); - } - - - - // inttestvec wrappers - - template<typename real_t, int size> - inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x) - { - return x.abs(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x) - { - return x.as_bool(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x) - { - return x.as_float(); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x, - inttestvec<real_t, size> y, - inttestvec<real_t, size> z) - { - return x.bitifthen(y, z); - } - - template<typename real_t, int size> - inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x) - { - return x.clz(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x) - { - return x.convert_bool(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x) - { - return x.convert_float(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x) - { - return x.isignbit(); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> lsr(inttestvec<real_t, size> x, - typename inttestvec<real_t, size>::int_t n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> lsr(inttestvec<real_t, size> x, - inttestvec<real_t, size> n) - { - return x.lsr(n); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> max(inttestvec<real_t, size> x, - inttestvec<real_t, size> y) - { - return x.max(y); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> min(inttestvec<real_t, size> x, - inttestvec<real_t, size> y) - { - return x.min(y); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> popcount(inttestvec<real_t, size> x) - { - return x.popcount(); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> rotate(inttestvec<real_t, size> x, - typename inttestvec<real_t, size>::int_t n) - { - return x.rotate(n); - } - - template<typename real_t, int size> - inline - inttestvec<real_t, size> rotate(inttestvec<real_t, size> x, - inttestvec<real_t, size> n) - { - return x.rotate(n); - } - - - - // realtestvec wrappers - - template<typename real_t, int size> - inline - realtestvec<real_t, size> - loada(real_t const* p, - realtestvec<real_t, size> x, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.loada(p, m); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> - loadu(real_t const* p, - realtestvec<real_t, size> x, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, m); - } - - template<typename real_t, int size> - inline - realtestvec<real_t, size> - loadu(real_t const* p, size_t ioff, - realtestvec<real_t, size> x, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.loadu(p, ioff, m); - } - - template<typename real_t, int size> - inline void storea(realtestvec<real_t, size> x, real_t* p) - { - return x.storea(p); - } - - template<typename real_t, int size> - inline void storeu(realtestvec<real_t, size> x, real_t* p) - { - return x.storeu(p); - } - - template<typename real_t, int size> - inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff) - { - return x.storeu(p, ioff); - } - - template<typename real_t, int size> - inline void storea(realtestvec<real_t, size> x, real_t* p, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.storea(p, m); - } - - template<typename real_t, int size> - inline void storeu(realtestvec<real_t, size> x, real_t* p, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.storeu(p, m); - } - - template<typename real_t, int size> - inline void storeu(realtestvec<real_t, size> x, real_t* p, size_t ioff, - typename realtestvec<real_t, size>::mask_t const& m) - { - return x.storeu(p, ioff, m); - } - - - - template<typename real_t, int size> - inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x) - { - return x.as_int(); - } - - template<typename real_t, int size> - inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x) - { - return x.convert_int(); - } - - template<typename real_t, int size> - inline real_t maxval(realtestvec<real_t, size> x) - { - return x.maxval(); - } - - template<typename real_t, int size> - inline real_t minval(realtestvec<real_t, size> x) - { - return x.minval(); - } - - template<typename real_t, int size> - inline real_t prod(realtestvec<real_t, size> x) - { - return x.prod(); - } - - template<typename real_t, int size> - inline real_t sum(realtestvec<real_t, size> x) - { - return x.sum(); - } - - - - template<typename real_t, int size> - inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x) - { - return x.acos(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x) - { - return x.acosh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x) - { - return x.asin(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x) - { - return x.asinh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x) - { - return x.atan(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.atan2(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x) - { - return x.atanh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x) - { - return x.cbrt(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x) - { - return x.ceil(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.copysign(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x) - { - return x.cos(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x) - { - return x.cosh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x) - { - return x.exp(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x) - { - return x.exp10(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x) - { - return x.exp2(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x) - { - return x.expm1(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x) - { - return x.fabs(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x) - { - return x.floor(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.fdim(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x, - realtestvec<real_t, size> y, - realtestvec<real_t, size> z) - { - return x.fma(y, z); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.fmax(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.fmin(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.fmod(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x, - inttestvec<real_t, size>* r) - { - return x.frexp(r); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.hypot(y); - } - - template<typename real_t, int size> - inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x) - { - return x.ilogb(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x) - { - return x.isfinite(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x) - { - return x.isinf(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x) - { - return x.isnan(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x) - { - return x.isnormal(); - } - - template<typename real_t, int size> - inline - realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x, - typename inttestvec<real_t, size>::int_t n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline - realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x, - inttestvec<real_t, size> n) - { - return x.ldexp(n); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> log(realtestvec<real_t, size> x) - { - return x.log(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x) - { - return x.log10(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x) - { - return x.log1p(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x) - { - return x.log2(); - } - - template<typename real_t, int size> - inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x) - { - return x.lrint(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x, - realtestvec<real_t, size> y, - realtestvec<real_t, size> z) - { - return x.mad(y, z); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.nextafter(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.pow(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x) - { - return x.rcp(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x, - realtestvec<real_t, size> y) - { - return x.remainder(y); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x) - { - return x.rint(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> round(realtestvec<real_t, size> x) - { - return x.round(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x) - { - return x.rsqrt(); - } - - template<typename real_t, int size> - inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x) - { - return x.signbit(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x) - { - return x.sin(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x) - { - return x.sinh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x) - { - return x.sqrt(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x) - { - return x.tan(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x) - { - return x.tanh(); - } - - template<typename real_t, int size> - inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x) - { - return x.trunc(); - } - - - -#ifndef VML_NO_IOSTREAM - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - booltestvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - inttestvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; - } - - template<typename real_t, int size> - std::ostream& operator<<(std::ostream& os, - realtestvec<real_t, size> const& x) - { - os << "["; - for (int i=0; i<size; ++i) { - if (i!=0) os << ","; - os << x[i]; - } - os << "]"; - return os; + + realvec_t &operator+=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] += x.v[d]; + return *this; + } + realvec_t &operator-=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] -= x.v[d]; + return *this; + } + realvec_t &operator*=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] *= x.v[d]; + return *this; + } + realvec_t &operator/=(realvec_t const &x) { + for (int d = 0; d < size; ++d) + v[d] /= x.v[d]; + return *this; } + + realvec_t operator+(realvec_t x) const { + realvec_t res = *this; + return res += x; + } + realvec_t operator-(realvec_t x) const { + realvec_t res = *this; + return res -= x; + } + realvec_t operator*(realvec_t x) const { + realvec_t res = *this; + return res *= x; + } + realvec_t operator/(realvec_t x) const { + realvec_t res = *this; + return res /= x; + } + + real_t maxval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res = vml_std::fmax(res, v[d]); + return res; + } + real_t minval() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res = vml_std::fmin(res, v[d]); + return res; + } + real_t prod() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res *= v[d]; + return res; + } + real_t sum() const { + real_t res = v[0]; + for (int d = 1; d < size; ++d) + res += v[d]; + return res; + } + + boolvec_t operator==(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] == x.v[d]; + return res; + } + boolvec_t operator!=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] != x.v[d]; + return res; + } + boolvec_t operator<(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] < x.v[d]; + return res; + } + boolvec_t operator<=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] <= x.v[d]; + return res; + } + boolvec_t operator>(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] > x.v[d]; + return res; + } + boolvec_t operator>=(realvec_t const &x) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] >= x.v[d]; + return res; + } + + realvec_t acos() const { return MF::vml_acos(*this); } + realvec_t acosh() const { return MF::vml_acosh(*this); } + realvec_t asin() const { return MF::vml_asin(*this); } + realvec_t asinh() const { return MF::vml_asinh(*this); } + realvec_t atan() const { return MF::vml_atan(*this); } + realvec_t atan2(realvec_t y) const { return MF::vml_atan2(*this, y); } + realvec_t atanh() const { return MF::vml_atanh(*this); } + realvec_t cbrt() const { return MF::vml_cbrt(*this); } + realvec_t ceil() const { return MF::vml_ceil(*this); } + realvec_t copysign(realvec_t y) const { return MF::vml_copysign(*this, y); } + realvec_t cos() const { return MF::vml_cos(*this); } + realvec_t cosh() const { return MF::vml_cosh(*this); } + realvec_t exp() const { return MF::vml_exp(*this); } + realvec_t exp10() const { return MF::vml_exp10(*this); } + realvec_t exp2() const { return MF::vml_exp2(*this); } + realvec_t expm1() const { return MF::vml_expm1(*this); } + realvec_t fabs() const { return MF::vml_fabs(*this); } + realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); } + realvec_t floor() const { return MF::vml_floor(*this); } + realvec_t fma(realvec_t y, realvec_t z) const { + return MF::vml_fma(*this, y, z); + } + realvec_t fmax(realvec_t y) const { return MF::vml_fmax(*this, y); } + realvec_t fmin(realvec_t y) const { return MF::vml_fmin(*this, y); } + realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); } + realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec_t ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec_t log() const { return MF::vml_log(*this); } + realvec_t log10() const { return MF::vml_log10(*this); } + realvec_t log1p() const { return MF::vml_log1p(*this); } + realvec_t log2() const { return MF::vml_log2(*this); } + intvec_t lrint() const { return MF::vml_lrint(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec_t nextafter(realvec_t y) const { return MF::vml_nextafter(*this, y); } + realvec_t pow(realvec_t y) const { return MF::vml_pow(*this, y); } + realvec_t rcp() const { return MF::vml_rcp(*this); } + realvec_t remainder(realvec_t y) const { return MF::vml_remainder(*this, y); } + realvec_t rint() const { return MF::vml_rint(*this); } + realvec_t round() const { return MF::vml_round(*this); } + realvec_t rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec_t sin() const { return MF::vml_sin(*this); } + realvec_t sinh() const { return MF::vml_sinh(*this); } + realvec_t sqrt() const { return MF::vml_sqrt(*this); } + realvec_t tan() const { return MF::vml_tan(*this); } + realvec_t tanh() const { return MF::vml_tanh(*this); } + realvec_t trunc() const { return MF::vml_trunc(*this); } +}; + +// booltestvec definitions + +template <typename T, int N> +inline typename booltestvec<T, N>::intvec_t booltestvec<T, N>::as_int() const { + return convert_int(); +} + +template <typename T, int N> +inline typename booltestvec<T, N>::intvec_t +booltestvec<T, N>::convert_int() const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d]; + return res; +} + +template <typename T, int N> +inline typename booltestvec<T, N>::boolvec_t +booltestvec<T, N>::ifthen(boolvec_t x, boolvec_t y) const { + boolvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +template <typename T, int N> +inline typename booltestvec<T, N>::intvec_t +booltestvec<T, N>::ifthen(intvec_t x, intvec_t y) const { + intvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +template <typename T, int N> +inline typename booltestvec<T, N>::realvec_t +booltestvec<T, N>::ifthen(realvec_t x, realvec_t y) const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = v[d] ? x.v[d] : y.v[d]; + return res; +} + +// inttestvec definitions + +template <typename T, int N> +inline typename inttestvec<T, N>::realvec_t inttestvec<T, N>::as_float() const { + realvec_t res; + for (int d = 0; d < size; ++d) + res.v[d] = FP::as_float(v[d]); + return res; +} + +template <typename T, int N> +inline typename inttestvec<T, N>::realvec_t +inttestvec<T, N>::convert_float() const { + return MF::vml_convert_float(*this); +} + +// Wrappers + +// booltestvec wrappers + +template <typename real_t, int size> +inline inttestvec<real_t, size> as_int(booltestvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> convert_int(booltestvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline bool all(booltestvec<real_t, size> x) { + return x.all(); +} + +template <typename real_t, int size> +inline bool any(booltestvec<real_t, size> x) { + return x.any(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> ifthen(booltestvec<real_t, size> c, + booltestvec<real_t, size> x, + booltestvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> ifthen(booltestvec<real_t, size> c, + inttestvec<real_t, size> x, + inttestvec<real_t, size> y) { + return c.ifthen(x, y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> ifthen(booltestvec<real_t, size> c, + realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return c.ifthen(x, y); +} + +// inttestvec wrappers + +template <typename real_t, int size> +inline inttestvec<real_t, size> abs(inttestvec<real_t, size> x) { + return x.abs(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> as_bool(inttestvec<real_t, size> x) { + return x.as_bool(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> as_float(inttestvec<real_t, size> x) { + return x.as_float(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> bitifthen(inttestvec<real_t, size> x, + inttestvec<real_t, size> y, + inttestvec<real_t, size> z) { + return x.bitifthen(y, z); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> clz(inttestvec<real_t, size> x) { + return x.clz(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> convert_bool(inttestvec<real_t, size> x) { + return x.convert_bool(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> convert_float(inttestvec<real_t, size> x) { + return x.convert_float(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> isignbit(inttestvec<real_t, size> x) { + return x.isignbit(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> +lsr(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> lsr(inttestvec<real_t, size> x, + inttestvec<real_t, size> n) { + return x.lsr(n); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> max(inttestvec<real_t, size> x, + inttestvec<real_t, size> y) { + return x.max(y); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> min(inttestvec<real_t, size> x, + inttestvec<real_t, size> y) { + return x.min(y); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> popcount(inttestvec<real_t, size> x) { + return x.popcount(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> +rotate(inttestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) { + return x.rotate(n); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> rotate(inttestvec<real_t, size> x, + inttestvec<real_t, size> n) { + return x.rotate(n); +} + +// realtestvec wrappers + +template <typename real_t, int size> +inline realtestvec<real_t, size> +loada(real_t const *p, realtestvec<real_t, size> x, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.loada(p, m); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> +loadu(real_t const *p, realtestvec<real_t, size> x, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.loadu(p, m); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> +loadu(real_t const *p, size_t ioff, realtestvec<real_t, size> x, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.loadu(p, ioff, m); +} + +template <typename real_t, int size> +inline void storea(realtestvec<real_t, size> x, real_t *p) { + return x.storea(p); +} + +template <typename real_t, int size> +inline void storeu(realtestvec<real_t, size> x, real_t *p) { + return x.storeu(p); +} + +template <typename real_t, int size> +inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff) { + return x.storeu(p, ioff); +} + +template <typename real_t, int size> +inline void storea(realtestvec<real_t, size> x, real_t *p, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.storea(p, m); +} + +template <typename real_t, int size> +inline void storeu(realtestvec<real_t, size> x, real_t *p, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.storeu(p, m); +} + +template <typename real_t, int size> +inline void storeu(realtestvec<real_t, size> x, real_t *p, size_t ioff, + typename realtestvec<real_t, size>::mask_t const &m) { + return x.storeu(p, ioff, m); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> as_int(realtestvec<real_t, size> x) { + return x.as_int(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> convert_int(realtestvec<real_t, size> x) { + return x.convert_int(); +} + +template <typename real_t, int size> +inline real_t maxval(realtestvec<real_t, size> x) { + return x.maxval(); +} + +template <typename real_t, int size> +inline real_t minval(realtestvec<real_t, size> x) { + return x.minval(); +} + +template <typename real_t, int size> +inline real_t prod(realtestvec<real_t, size> x) { + return x.prod(); +} + +template <typename real_t, int size> +inline real_t sum(realtestvec<real_t, size> x) { + return x.sum(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> acos(realtestvec<real_t, size> x) { + return x.acos(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> acosh(realtestvec<real_t, size> x) { + return x.acosh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> asin(realtestvec<real_t, size> x) { + return x.asin(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> asinh(realtestvec<real_t, size> x) { + return x.asinh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> atan(realtestvec<real_t, size> x) { + return x.atan(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> atan2(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.atan2(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> atanh(realtestvec<real_t, size> x) { + return x.atanh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> cbrt(realtestvec<real_t, size> x) { + return x.cbrt(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> ceil(realtestvec<real_t, size> x) { + return x.ceil(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> copysign(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.copysign(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> cos(realtestvec<real_t, size> x) { + return x.cos(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> cosh(realtestvec<real_t, size> x) { + return x.cosh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> exp(realtestvec<real_t, size> x) { + return x.exp(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> exp10(realtestvec<real_t, size> x) { + return x.exp10(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> exp2(realtestvec<real_t, size> x) { + return x.exp2(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> expm1(realtestvec<real_t, size> x) { + return x.expm1(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fabs(realtestvec<real_t, size> x) { + return x.fabs(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> floor(realtestvec<real_t, size> x) { + return x.floor(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fdim(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.fdim(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fma(realtestvec<real_t, size> x, + realtestvec<real_t, size> y, + realtestvec<real_t, size> z) { + return x.fma(y, z); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fmax(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.fmax(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fmin(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.fmin(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> fmod(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.fmod(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> frexp(realtestvec<real_t, size> x, + inttestvec<real_t, size> *r) { + return x.frexp(r); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> hypot(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.hypot(y); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> ilogb(realtestvec<real_t, size> x) { + return x.ilogb(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> isfinite(realtestvec<real_t, size> x) { + return x.isfinite(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> isinf(realtestvec<real_t, size> x) { + return x.isinf(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> isnan(realtestvec<real_t, size> x) { + return x.isnan(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> isnormal(realtestvec<real_t, size> x) { + return x.isnormal(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> +ldexp(realtestvec<real_t, size> x, typename inttestvec<real_t, size>::int_t n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> ldexp(realtestvec<real_t, size> x, + inttestvec<real_t, size> n) { + return x.ldexp(n); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> log(realtestvec<real_t, size> x) { + return x.log(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> log10(realtestvec<real_t, size> x) { + return x.log10(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> log1p(realtestvec<real_t, size> x) { + return x.log1p(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> log2(realtestvec<real_t, size> x) { + return x.log2(); +} + +template <typename real_t, int size> +inline inttestvec<real_t, size> lrint(realtestvec<real_t, size> x) { + return x.lrint(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> mad(realtestvec<real_t, size> x, + realtestvec<real_t, size> y, + realtestvec<real_t, size> z) { + return x.mad(y, z); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> nextafter(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.nextafter(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> pow(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.pow(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> rcp(realtestvec<real_t, size> x) { + return x.rcp(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> remainder(realtestvec<real_t, size> x, + realtestvec<real_t, size> y) { + return x.remainder(y); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> rint(realtestvec<real_t, size> x) { + return x.rint(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> round(realtestvec<real_t, size> x) { + return x.round(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> rsqrt(realtestvec<real_t, size> x) { + return x.rsqrt(); +} + +template <typename real_t, int size> +inline booltestvec<real_t, size> signbit(realtestvec<real_t, size> x) { + return x.signbit(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> sin(realtestvec<real_t, size> x) { + return x.sin(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> sinh(realtestvec<real_t, size> x) { + return x.sinh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> sqrt(realtestvec<real_t, size> x) { + return x.sqrt(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> tan(realtestvec<real_t, size> x) { + return x.tan(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> tanh(realtestvec<real_t, size> x) { + return x.tanh(); +} + +template <typename real_t, int size> +inline realtestvec<real_t, size> trunc(realtestvec<real_t, size> x) { + return x.trunc(); +} + +#ifndef VML_NO_IOSTREAM +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, booltestvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, inttestvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} + +template <typename real_t, int size> +std::ostream &operator<<(std::ostream &os, realtestvec<real_t, size> const &x) { + os << "["; + for (int i = 0; i < size; ++i) { + if (i != 0) + os << ","; + os << x[i]; + } + os << "]"; + return os; +} #endif - + } // namespace vecmathlib -#endif // #ifndef VEC_TEST_H +#endif // #ifndef VEC_TEST_H diff --git a/vec_vsx_double2.h b/vec_vsx_double2.h index 6725859..fa43a6f 100644 --- a/vec_vsx_double2.h +++ b/vec_vsx_double2.h @@ -13,679 +13,572 @@ #include <altivec.h> #if defined __clang__ -# define __vector vector -# define __pixel pixel -# define __bool bool +#define __vector vector +#define __pixel pixel +#define __bool bool #elif defined __gcc__ -# undef vector -# undef pixel -# undef bool +#undef vector +#undef pixel +#undef bool #elif defined __xlC__ -# define __bool bool +#define __bool bool #else -# error "Unknown compiler" +#error "Unknown compiler" #endif - - namespace vecmathlib { - + #define VECMATHLIB_HAVE_VEC_DOUBLE_2 - template<> struct boolvec<double,2>; - template<> struct intvec<double,2>; - template<> struct realvec<double,2>; - - - - template<> - struct boolvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef bool scalar_t; - typedef __vector __bool long long bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - // truth values are interpreted bit-wise - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator bvector_t() const { return v; } - bool operator[](int n) const - { - return to_bool(vecmathlib::get_elt<BV,bvector_t,uint_t>(v, n)); - } - boolvec& set_elt(int n, bool a) - { - return - vecmathlib::set_elt<BV,bvector_t,uint_t>(v, n, from_bool(a)), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return vec_nor(v, v); } - - boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } - boolvec operator||(boolvec x) const { return vec_or(v, x.v); } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } - - bool all() const { return vec_all_ne(v, BV(false)); } - bool any() const { return vec_any_ne(v, BV(false)); } - - - - // ifthen(condition, then-value, else-value) - boolvec_t ifthen(boolvec_t x, boolvec_t y) const; - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef int_t scalar_t; - typedef __vector signed long long ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec intvec_t; - typedef realvec<real_t, size> realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats((long long)a)) {} - intvec(int_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - static intvec iota() { return (__vector signed long long){0, 1}; } - - operator ivector_t() const { return v; } - int_t operator[](int n) const - { - return vecmathlib::get_elt<IV,ivector_t,int_t>(v, n); - } - intvec_t& set_elt(int n, int_t a) - { - return vecmathlib::set_elt<IV,ivector_t,int_t>(v, n, a), *this; - } - - - - // Vector casts do not change the bit battern - boolvec_t as_bool() const { return (__vector __bool long long)v; } - boolvec_t convert_bool() const { return *this != IV(I(0)); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Permutation control words - private: - // 0123 4567 -> 1436 - // exchange pairs - static __vector unsigned char perm_int_swap() - { - return - (__vector unsigned char) - {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27}; - } - // 0123 4567 -> 0426 - // broadcast high elements of pairs - static __vector unsigned char perm_int_bchi() - { - return - (__vector unsigned char) - {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27}; - } - public: - - - - intvec operator+() const { return *this; } - intvec operator-() const { return vec_neg(v); } - - intvec operator+(intvec x) const { return vec_add(v, x.v); } - intvec operator-(intvec x) const { return vec_sub(v, x.v); } - intvec operator*(intvec x) const { return vec_mul(v, x.v); } - intvec operator/(intvec x) const { return vec_div(v, x.v); } - intvec operator%(intvec x) const { return *this - *this / x * x; } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - intvec& operator*=(intvec const& x) { return *this=*this*x; } - intvec& operator/=(intvec const& x) { return *this=*this/x; } - intvec& operator%=(intvec const& x) { return *this=*this%x; } - - - - intvec operator~() const - { - return (__vector signed long long)vec_nor((__vector signed int)v, (__vector signed int)v); - } - - intvec operator&(intvec x) const - { - return (__vector signed long long)vec_and((__vector signed int)v, (__vector signed int)x.v); - } - intvec operator|(intvec x) const - { - return (__vector signed long long)vec_or ((__vector signed int)v, (__vector signed int)x.v); - } - intvec operator^(intvec x) const - { - return (__vector signed long long)vec_xor((__vector signed int)v, (__vector signed int)x.v); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - intvec_t bitifthen(intvec_t x, intvec_t y) const; - - - - intvec lsr(int_t n) const { return lsr(IV(n)); } - intvec_t rotate(int_t n) const; - intvec operator>>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<<n; } - - intvec lsr(intvec n) const - { - // return vec_sr(v, (__vector unsigned long long)n.v); - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, U((*this)[i]) >> U(n[i])); - } - return r; - } - intvec_t rotate(intvec_t n) const; - intvec operator>>(intvec n) const - { - // return vec_sra(v, (__vector unsigned long long)n.v); - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] >> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - // return vec_sl(v, (__vector unsigned long long)n.v); - intvec r; - for (int i=0; i<size; ++i) { - r.set_elt(i, (*this)[i] << n[i]); - } - return r; - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<<n; } - - intvec_t clz() const; - intvec_t popcount() const; - - - - boolvec_t operator==(intvec const& x) const - { - // return vec_cmpeq(v, x.v); - __vector signed int a = (__vector signed int)v; - __vector signed int b = (__vector signed int)x.v; - __vector __bool int c = vec_cmpeq(a, b); - __vector __bool int cx = vec_perm(c, c, perm_int_swap()); - __vector __bool int r = vec_and(c, cx); - return (__vector __bool long long)r; - } - boolvec_t operator!=(intvec const& x) const { return !(*this == x); } - boolvec_t operator<(intvec const& x) const - { - __vector signed int a = (__vector signed int)v; - __vector signed int b = (__vector signed int)x.v; - __vector __bool int lt = vec_cmplt(a, b); - __vector __bool int eq = vec_cmpeq(a, b); - __vector unsigned int ua = (__vector unsigned int)v; - __vector unsigned int ub = (__vector unsigned int)x.v; - __vector __bool int ult = vec_cmplt(ua, ub); - __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap()); - __vector __bool int r = vec_or(lt, vec_and(eq, ultx)); - r = vec_perm(r, r, perm_int_bchi()); - return (__vector __bool long long)r; - } - boolvec_t operator<=(intvec const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! (*this < x); - } - - intvec_t abs() const; - boolvec_t isignbit() const { return (*this >> (bits-1)).as_bool(); } - intvec_t max(intvec_t x) const; - intvec_t min(intvec_t x) const; - }; - - - - template<> - struct realvec<double,2>: floatprops<double> - { - static int const size = 2; - typedef real_t scalar_t; - typedef __vector double vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return "<VSX:2*double>"; } - void barrier() { __asm__("": "+v"(v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec<real_t, size> boolvec_t; - typedef intvec<real_t, size> intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops<real_t> FP; - typedef mathfuncs<realvec_t> MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(real_t const* as) - { - for (int d=0; d<size; ++d) set_elt(d, as[d]); - } - - operator vector_t() const { return v; } - real_t operator[](int n) const - { - return vecmathlib::get_elt<RV,vector_t,real_t>(v, n); - } - realvec_t& set_elt(int n, real_t a) - { - return vecmathlib::set_elt<RV,vector_t,real_t>(v, n, a), *this; - } - - - - typedef vecmathlib::mask_t<realvec_t> mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_xld2(0, (real_t*)p); - } - static realvec_t loadu(real_t const* p) - { - // TODO: Can this handle unaligned access? - return vec_xld2(0, (real_t*)p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vec_xstd2(v, 0, p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return (__vector signed long long) v; } - intvec_t convert_int() const { return MF::vml_convert_int(*this); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { return vec_mul(v, x.v); } - realvec operator/(realvec x) const { return vec_div(v, x.v); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t maxval() const - { - return vml_std::fmax((*this)[0], (*this)[1]); +template <> struct boolvec<double, 2>; +template <> struct intvec<double, 2>; +template <> struct realvec<double, 2>; + +template <> struct boolvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef bool scalar_t; + typedef __vector __bool long long bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + +private: + // true values are -1, false values are 0 + // truth values are interpreted bit-wise + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + +public: + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x) : v(x) {} + boolvec(bool a) + : v((bvector_t)vec_splats((unsigned long long)from_bool(a))) {} + boolvec(bool const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator bvector_t() const { return v; } + bool operator[](int n) const { + return to_bool(vecmathlib::get_elt<BV, bvector_t, uint_t>(v, n)); + } + boolvec &set_elt(int n, bool a) { + return vecmathlib::set_elt<BV, bvector_t, uint_t>(v, n, from_bool(a)), + *this; + } + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + boolvec operator!() const { return vec_nor(v, v); } + + boolvec operator&&(boolvec x) const { return vec_and(v, x.v); } + boolvec operator||(boolvec x) const { return vec_or(v, x.v); } + boolvec operator==(boolvec x) const { return !(*this != x); } + boolvec operator!=(boolvec x) const { return vec_xor(v, x.v); } + + bool all() const { return vec_all_ne(v, BV(false)); } + bool any() const { return vec_any_ne(v, BV(false)); } + + // ifthen(condition, then-value, else-value) + boolvec_t ifthen(boolvec_t x, boolvec_t y) const; + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec +}; + +template <> struct intvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef int_t scalar_t; + typedef __vector signed long long ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x) : v(x) {} + intvec(int_t a) : v(vec_splats((long long)a)) {} + intvec(int_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + static intvec iota() { return (__vector signed long long){0, 1}; } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { + return vecmathlib::get_elt<IV, ivector_t, int_t>(v, n); + } + intvec_t &set_elt(int n, int_t a) { + return vecmathlib::set_elt<IV, ivector_t, int_t>(v, n, a), *this; + } + + // Vector casts do not change the bit battern + boolvec_t as_bool() const { return (__vector __bool long long)v; } + boolvec_t convert_bool() const { return *this != IV(I(0)); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + // Permutation control words +private: + // 0123 4567 -> 1436 + // exchange pairs + static __vector unsigned char perm_int_swap() { + return (__vector unsigned char){4, 5, 6, 7, 16, 17, 18, 19, + 12, 13, 14, 15, 24, 25, 26, 27}; + } + // 0123 4567 -> 0426 + // broadcast high elements of pairs + static __vector unsigned char perm_int_bchi() { + return (__vector unsigned char){0, 1, 2, 3, 16, 17, 18, 19, + 8, 9, 10, 11, 24, 25, 26, 27}; + } + +public: + intvec operator+() const { return *this; } + intvec operator-() const { return vec_neg(v); } + + intvec operator+(intvec x) const { return vec_add(v, x.v); } + intvec operator-(intvec x) const { return vec_sub(v, x.v); } + intvec operator*(intvec x) const { return vec_mul(v, x.v); } + intvec operator/(intvec x) const { return vec_div(v, x.v); } + intvec operator%(intvec x) const { return *this - *this / x * x; } + + intvec &operator+=(intvec const &x) { return *this = *this + x; } + intvec &operator-=(intvec const &x) { return *this = *this - x; } + intvec &operator*=(intvec const &x) { return *this = *this * x; } + intvec &operator/=(intvec const &x) { return *this = *this / x; } + intvec &operator%=(intvec const &x) { return *this = *this % x; } + + intvec operator~() const { + return (__vector signed long long)vec_nor((__vector signed int)v, + (__vector signed int)v); + } + + intvec operator&(intvec x) const { + return (__vector signed long long)vec_and((__vector signed int)v, + (__vector signed int)x.v); + } + intvec operator|(intvec x) const { + return (__vector signed long long)vec_or((__vector signed int)v, + (__vector signed int)x.v); + } + intvec operator^(intvec x) const { + return (__vector signed long long)vec_xor((__vector signed int)v, + (__vector signed int)x.v); + } + + intvec &operator&=(intvec const &x) { return *this = *this & x; } + intvec &operator|=(intvec const &x) { return *this = *this | x; } + intvec &operator^=(intvec const &x) { return *this = *this ^ x; } + + intvec_t bitifthen(intvec_t x, intvec_t y) const; + + intvec lsr(int_t n) const { return lsr(IV(n)); } + intvec_t rotate(int_t n) const; + intvec operator>>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec &operator>>=(int_t n) { return *this = *this >> n; } + intvec &operator<<=(int_t n) { return *this = *this << n; } + + intvec lsr(intvec n) const { + // return vec_sr(v, (__vector unsigned long long)n.v); + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, U((*this)[i]) >> U(n[i])); } - real_t minval() const - { - return vml_std::fmin((*this)[0], (*this)[1]); + return r; + } + intvec_t rotate(intvec_t n) const; + intvec operator>>(intvec n) const { + // return vec_sra(v, (__vector unsigned long long)n.v); + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] >> n[i]); + } + return r; + } + intvec operator<<(intvec n) const { + // return vec_sl(v, (__vector unsigned long long)n.v); + intvec r; + for (int i = 0; i < size; ++i) { + r.set_elt(i, (*this)[i] << n[i]); } - real_t prod() const - { - return (*this)[0] * (*this)[1]; + return r; + } + intvec &operator>>=(intvec n) { return *this = *this >> n; } + intvec &operator<<=(intvec n) { return *this = *this << n; } + + intvec_t clz() const; + intvec_t popcount() const; + + boolvec_t operator==(intvec const &x) const { + // return vec_cmpeq(v, x.v); + __vector signed int a = (__vector signed int)v; + __vector signed int b = (__vector signed int)x.v; + __vector __bool int c = vec_cmpeq(a, b); + __vector __bool int cx = vec_perm(c, c, perm_int_swap()); + __vector __bool int r = vec_and(c, cx); + return (__vector __bool long long)r; + } + boolvec_t operator!=(intvec const &x) const { return !(*this == x); } + boolvec_t operator<(intvec const &x) const { + __vector signed int a = (__vector signed int)v; + __vector signed int b = (__vector signed int)x.v; + __vector __bool int lt = vec_cmplt(a, b); + __vector __bool int eq = vec_cmpeq(a, b); + __vector unsigned int ua = (__vector unsigned int)v; + __vector unsigned int ub = (__vector unsigned int)x.v; + __vector __bool int ult = vec_cmplt(ua, ub); + __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap()); + __vector __bool int r = vec_or(lt, vec_and(eq, ultx)); + r = vec_perm(r, r, perm_int_bchi()); + return (__vector __bool long long)r; + } + boolvec_t operator<=(intvec const &x) const { return !(*this > x); } + boolvec_t operator>(intvec const &x) const { return x < *this; } + boolvec_t operator>=(intvec const &x) const { return !(*this < x); } + + intvec_t abs() const; + boolvec_t isignbit() const { return (*this >> (bits - 1)).as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; +}; + +template <> struct realvec<double, 2> : floatprops<double> { + static int const size = 2; + typedef real_t scalar_t; + typedef __vector double vector_t; + static int const alignment = sizeof(vector_t); + + static char const *name() { return "<VSX:2*double>"; } + void barrier() { __asm__("" : "+v"(v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x) : v(x) {} + realvec(real_t a) : v(vec_splats(a)) {} + realvec(real_t const *as) { + for (int d = 0; d < size; ++d) + set_elt(d, as[d]); + } + + operator vector_t() const { return v; } + real_t operator[](int n) const { + return vecmathlib::get_elt<RV, vector_t, real_t>(v, n); + } + realvec_t &set_elt(int n, real_t a) { + return vecmathlib::set_elt<RV, vector_t, real_t>(v, n, a), *this; + } + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const *p) { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_xld2(0, (real_t *)p); + } + static realvec_t loadu(real_t const *p) { + // TODO: Can this handle unaligned access? + return vec_xld2(0, (real_t *)p); + } + static realvec_t loadu(real_t const *p, std::ptrdiff_t ioff) { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff); + return loadu(p + ioff); + } + realvec_t loada(real_t const *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); } - real_t sum() const - { - return (*this)[0] + (*this)[1]; + } + realvec_t loadu(real_t const *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); } - - - - boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return ! (*this == x); } - boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } - realvec fmax(realvec y) const { return vec_max(v, y.v); } - realvec fmin(realvec y) const { return vec_min(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec frexp(intvec_t* r) const { return MF::vml_frexp(*this, r); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec_t mad(realvec_t y, realvec_t z) const - { - return MF::vml_mad(*this, y, z); + } + realvec_t loadu(real_t const *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return loada(p + ioff, m); + return loadu(p + ioff, m); + } + + void storea(real_t *p) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + vec_xstd2(v, 0, p); + } + void storeu(real_t *p) const { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see <https://developer.apple.com/hardwaredrivers/ve/alignment.html> + // for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + } + void storeu(real_t *p, std::ptrdiff_t ioff) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff); + storeu(p + ioff); + } + void storea(real_t *p, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // Use vec_ste? + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = vec_re(v); // this is only an approximation - // TODO: use fma - // Note: don't rewrite this expression, this may introduce - // cancellation errors - r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp) - r += r * (RV(1.0) - x*r); - return r; + } + void storeu(real_t *p, mask_t const &m) const { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // Use vec_ste? + if (m.m[0]) + p[0] = (*this)[0]; + if (m.m[1]) + p[1] = (*this)[1]; } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const { return vec_round(v); /* sic! */} - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const { return RV(1.0) / sqrt(); } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return vec_sqrt(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline intvec<double,2> boolvec<double,2>::as_int() const - { - return (__vector signed long long) v; - } - - inline intvec<double,2> boolvec<double,2>::convert_int() const - { - return -(__vector signed long long)v; - } - - inline - boolvec<double,2> boolvec<double,2>::ifthen(boolvec_t x, boolvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - inline - intvec<double,2> boolvec<double,2>::ifthen(intvec_t x, intvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - inline - realvec<double,2> boolvec<double,2>::ifthen(realvec_t x, realvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline intvec<double,2> intvec<double,2>::abs() const - { - return MF::vml_abs(*this); - } - - inline realvec<double,2> intvec<double,2>::as_float() const - { - return (__vector double)v; - } - - inline intvec<double,2> intvec<double,2>::bitifthen(intvec_t x, - intvec_t y) const - { - return MF::vml_bitifthen(*this, x, y); - } - - inline intvec<double,2> intvec<double,2>::clz() const - { - return MF::vml_clz(*this); - } - - inline realvec<double,2> intvec<double,2>::convert_float() const - { - // return vec_ctd(v, 0); - return MF::vml_convert_float(*this); - } - - inline intvec<double,2> intvec<double,2>::max(intvec_t x) const - { - return MF::vml_max(*this, x); - } - - inline intvec<double,2> intvec<double,2>::min(intvec_t x) const - { - return MF::vml_min(*this, x); - } - - inline intvec<double,2> intvec<double,2>::popcount() const - { - return MF::vml_popcount(*this); - } - - inline intvec<double,2> intvec<double,2>::rotate(int_t n) const - { - return MF::vml_rotate(*this, n); - } - - inline intvec<double,2> intvec<double,2>::rotate(intvec_t n) const - { - return MF::vml_rotate(*this, n); - } - + } + void storeu(real_t *p, std::ptrdiff_t ioff, mask_t const &m) const { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) + return storea(p + ioff, m); + storeu(p + ioff, m); + } + + intvec_t as_int() const { return (__vector signed long long)v; } + intvec_t convert_int() const { return MF::vml_convert_int(*this); } + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { return vec_mul(v, x.v); } + realvec operator/(realvec x) const { return vec_div(v, x.v); } + + realvec &operator+=(realvec const &x) { return *this = *this + x; } + realvec &operator-=(realvec const &x) { return *this = *this - x; } + realvec &operator*=(realvec const &x) { return *this = *this * x; } + realvec &operator/=(realvec const &x) { return *this = *this / x; } + + real_t maxval() const { return vml_std::fmax((*this)[0], (*this)[1]); } + real_t minval() const { return vml_std::fmin((*this)[0], (*this)[1]); } + real_t prod() const { return (*this)[0] * (*this)[1]; } + real_t sum() const { return (*this)[0] + (*this)[1]; } + + boolvec_t operator==(realvec const &x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec const &x) const { return !(*this == x); } + boolvec_t operator<(realvec const &x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec const &x) const { return vec_cmple(v, x.v); } + boolvec_t operator>(realvec const &x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec const &x) const { return vec_cmpge(v, x.v); } + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return vec_max(v, y.v); } + realvec fmin(realvec y) const { return vec_min(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec_t mad(realvec_t y, realvec_t z) const { + return MF::vml_mad(*this, y, z); + } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { + realvec x = *this; + realvec r = vec_re(v); // this is only an approximation + // TODO: use fma + // Note: don't rewrite this expression, this may introduce + // cancellation errors + r += r * (RV(1.0) - x * r); // two Newton iterations (see vml_rcp) + r += r * (RV(1.0) - x * r); + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return vec_round(v); /* sic! */ } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { return RV(1.0) / sqrt(); } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return vec_sqrt(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return vec_trunc(v); } +}; + +// boolvec definitions + +inline intvec<double, 2> boolvec<double, 2>::as_int() const { + return (__vector signed long long)v; +} + +inline intvec<double, 2> boolvec<double, 2>::convert_int() const { + return -(__vector signed long long)v; +} + +inline boolvec<double, 2> boolvec<double, 2>::ifthen(boolvec_t x, + boolvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +inline intvec<double, 2> boolvec<double, 2>::ifthen(intvec_t x, + intvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +inline realvec<double, 2> boolvec<double, 2>::ifthen(realvec_t x, + realvec_t y) const { + return vec_sel(y.v, x.v, v); +} + +// intvec definitions + +inline intvec<double, 2> intvec<double, 2>::abs() const { + return MF::vml_abs(*this); +} + +inline realvec<double, 2> intvec<double, 2>::as_float() const { + return (__vector double)v; +} + +inline intvec<double, 2> intvec<double, 2>::bitifthen(intvec_t x, + intvec_t y) const { + return MF::vml_bitifthen(*this, x, y); +} + +inline intvec<double, 2> intvec<double, 2>::clz() const { + return MF::vml_clz(*this); +} + +inline realvec<double, 2> intvec<double, 2>::convert_float() const { + // return vec_ctd(v, 0); + return MF::vml_convert_float(*this); +} + +inline intvec<double, 2> intvec<double, 2>::max(intvec_t x) const { + return MF::vml_max(*this, x); +} + +inline intvec<double, 2> intvec<double, 2>::min(intvec_t x) const { + return MF::vml_min(*this, x); +} + +inline intvec<double, 2> intvec<double, 2>::popcount() const { + return MF::vml_popcount(*this); +} + +inline intvec<double, 2> intvec<double, 2>::rotate(int_t n) const { + return MF::vml_rotate(*this, n); +} + +inline intvec<double, 2> intvec<double, 2>::rotate(intvec_t n) const { + return MF::vml_rotate(*this, n); +} + } // namespace vecmathlib -#endif // #ifndef VEC_VSX_DOUBLE2_H +#endif // #ifndef VEC_VSX_DOUBLE2_H diff --git a/vecmathlib.h b/vecmathlib.h index 9accd24..0d72add 100644 --- a/vecmathlib.h +++ b/vecmathlib.h @@ -4,16 +4,14 @@ #define VECMATHLIB_H #if defined VML_DEBUG || defined VML_NODEBUG -# if defined VML_DEBUG && defined VML_NODEBUG -# error "Only one of VML_DEBUG or VML_NODEBUG may be defined" -# endif +#if defined VML_DEBUG && defined VML_NODEBUG +#error "Only one of VML_DEBUG or VML_NODEBUG may be defined" +#endif #else // default -# define VML_DEBUG +#define VML_DEBUG #endif - - // FP settings // Possible effects of not having VML_HAVE_FP_CONTRACT: @@ -23,7 +21,7 @@ // - can evaluate functions with reduced precision (80% of significant digits) // default settings -#undef VML_HAVE_DENORMALS // TODO +#undef VML_HAVE_DENORMALS // TODO #define VML_HAVE_FP_CONTRACT #define VML_HAVE_INF #define VML_HAVE_NAN @@ -31,63 +29,59 @@ // optimized settings #ifdef __FAST_MATH__ -# undef VML_HAVE_DENORMALS -# undef VML_HAVE_FP_CONTRACT -# undef VML_HAVE_INF -# undef VML_HAVE_NAN +#undef VML_HAVE_DENORMALS +#undef VML_HAVE_FP_CONTRACT +#undef VML_HAVE_INF +#undef VML_HAVE_NAN #endif #ifdef VML_DEBUG -# define VML_CONFIG_DEBUG " debug" +#define VML_CONFIG_DEBUG " debug" #else -# define VML_CONFIG_DEBUG " no-debug" +#define VML_CONFIG_DEBUG " no-debug" #endif #ifdef VML_DENORMALS -# define VML_CONFIG_DENORMALS " denormals" +#define VML_CONFIG_DENORMALS " denormals" #else -# define VML_CONFIG_DENORMALS " no-denormals" +#define VML_CONFIG_DENORMALS " no-denormals" #endif #ifdef VML_FP_CONTRACT -# define VML_CONFIG_FP_CONTRACT " fp-contract" +#define VML_CONFIG_FP_CONTRACT " fp-contract" #else -# define VML_CONFIG_FP_CONTRACT " no-fp-contract" +#define VML_CONFIG_FP_CONTRACT " no-fp-contract" #endif #ifdef VML_INF -# define VML_CONFIG_INF " inf" +#define VML_CONFIG_INF " inf" #else -# define VML_CONFIG_INF " no-inf" +#define VML_CONFIG_INF " no-inf" #endif #ifdef VML_NAN -# define VML_CONFIG_NAN " nan" +#define VML_CONFIG_NAN " nan" #else -# define VML_CONFIG_NAN " no-nan" +#define VML_CONFIG_NAN " no-nan" #endif // TODO: introduce mad, as fast version of fma (check FP_FAST_FMA) // TODO: introduce ieee_isnan and friends // TODO: switch between isnan and ieee_isnan at an outside level - - // This workaround is needed for older libstdc++ versions such as the // one in Debian 6.0 when compiled with clang++ // <http://lists.cs.uiuc.edu/pipermail/cfe-dev/2011-February/013207.html>. // The version time stamp used below is the one in Debian 6.0. -#include <cstring> // pull in __GLIBCXX__ +#include <cstring> // pull in __GLIBCXX__ #if defined __GLIBCXX__ && __GLIBCXX__ <= 20101114 -namespace std { class type_info; } +namespace std { +class type_info; +} #endif - - #include <cassert> - - #ifdef VML_DEBUG -# define VML_ASSERT(x) assert(x) +#define VML_ASSERT(x) assert(x) #else -# define VML_ASSERT(x) ((void)0) +#define VML_ASSERT(x) ((void)0) #endif // Scalarise all vector operations, and use libm's functions (mostly @@ -96,146 +90,142 @@ namespace std { class type_info; } #ifdef __clang__ // Use compiler-provided vector types -# include "vec_builtin.h" +#include "vec_builtin.h" #endif // Scalarise all vector operations; don't use libm, use only // Vecmathlib's functions (mostly useful for testing Vecmathlib) #include "vec_test.h" -#if defined __ARM_NEON__ // ARM NEON -# include "vec_neon_float2.h" -# include "vec_neon_float4.h" -# define VML_CONFIG_NEON " NEON" -#else -# define VML_CONFIG_NEON -#endif - -#if defined __SSE2__ // Intel SSE 2 -# include "vec_sse_float1.h" -# include "vec_sse_float4.h" -# include "vec_sse_double1.h" -# include "vec_sse_double2.h" -# if defined __SSE3__ -# define VML_CONFIG_SSE3 " SSE3" -# else -# define VML_CONFIG_SSE3 -# endif -# if defined __SSSE3__ -# define VML_CONFIG_SSSE3 " SSSE3" -# else -# define VML_CONFIG_SSSE3 -# endif -# if defined __SSE4_1__ -# define VML_CONFIG_SSE4_1 " SSE4.1" -# else -# define VML_CONFIG_SSE4_1 -# endif -# if defined __SSE4a__ -# define VML_CONFIG_SSE4a " SSE4a" -# else -# define VML_CONFIG_SSE4a -# endif -# define VML_CONFIG_SSE2 " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a -#else -# define VML_CONFIG_SSE2 -#endif - -#if defined __AVX__ // Intel AVX -# include "vec_avx_fp8_32.h" -# include "vec_avx_fp16_16.h" -# include "vec_avx_float8.h" -# include "vec_avx_double4.h" -# define VML_CONFIG_AVX " AVX" -#else -# define VML_CONFIG_AVX -#endif - -#if defined __MIC__ // Intel MIC +#if defined __ARM_NEON__ // ARM NEON +#include "vec_neon_float2.h" +#include "vec_neon_float4.h" +#define VML_CONFIG_NEON " NEON" +#else +#define VML_CONFIG_NEON +#endif + +#if defined __SSE2__ // Intel SSE 2 +#include "vec_sse_float1.h" +#include "vec_sse_float4.h" +#include "vec_sse_double1.h" +#include "vec_sse_double2.h" +#if defined __SSE3__ +#define VML_CONFIG_SSE3 " SSE3" +#else +#define VML_CONFIG_SSE3 +#endif +#if defined __SSSE3__ +#define VML_CONFIG_SSSE3 " SSSE3" +#else +#define VML_CONFIG_SSSE3 +#endif +#if defined __SSE4_1__ +#define VML_CONFIG_SSE4_1 " SSE4.1" +#else +#define VML_CONFIG_SSE4_1 +#endif +#if defined __SSE4a__ +#define VML_CONFIG_SSE4a " SSE4a" +#else +#define VML_CONFIG_SSE4a +#endif +#define VML_CONFIG_SSE2 \ + " SSE2" VML_CONFIG_SSE3 VML_CONFIG_SSSE3 VML_CONFIG_SSE4_1 VML_CONFIG_SSE4a +#else +#define VML_CONFIG_SSE2 +#endif + +#if defined __AVX__ // Intel AVX +#include "vec_avx_fp8_32.h" +#include "vec_avx_fp16_16.h" +#include "vec_avx_float8.h" +#include "vec_avx_double4.h" +#define VML_CONFIG_AVX " AVX" +#else +#define VML_CONFIG_AVX +#endif + +#if defined __MIC__ // Intel MIC // TODO: single precision? -# include "vec_mic_double8.h" -# define VML_CONFIG_MIC " MIC" +#include "vec_mic_double8.h" +#define VML_CONFIG_MIC " MIC" #else -# define VML_CONFIG_MIC +#define VML_CONFIG_MIC #endif -#if defined __ALTIVEC__ // IBM Altivec -# include "vec_altivec_float4.h" -# define VML_CONFIG_ALTIVEC " Altivec" +#if defined __ALTIVEC__ // IBM Altivec +#include "vec_altivec_float4.h" +#define VML_CONFIG_ALTIVEC " Altivec" #else -# define VML_CONFIG_ALTIVEC +#define VML_CONFIG_ALTIVEC #endif #if defined __ALTIVEC__ && defined _ARCH_PWR7 // IBM VSX -# include "vec_vsx_double2.h" -# define VML_CONFIG_VSX " VSX" +#include "vec_vsx_double2.h" +#define VML_CONFIG_VSX " VSX" #else -# define VML_CONFIG_VSX +#define VML_CONFIG_VSX #endif // TODO: IBM Blue Gene/P DoubleHummer #if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX // TODO: vec_qpx_float4 -# include "vec_qpx_double4.h" -# define VML_CONFIG_QPX " QPX" +#include "vec_qpx_double4.h" +#define VML_CONFIG_QPX " QPX" #else -# define VML_CONFIG_QPX +#define VML_CONFIG_QPX #endif -#define VECMATHLIB_CONFIGURATION \ - "VecmathlibConfiguration" \ - VML_CONFIG_DEBUG \ - VML_CONFIG_DENORMALS VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN \ - VML_CONFIG_NEON \ - VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC \ - VML_CONFIG_ALTIVEC VML_CONFIG_VSX \ - VML_CONFIG_QPX - - +#define VECMATHLIB_CONFIGURATION \ + "VecmathlibConfiguration" VML_CONFIG_DEBUG VML_CONFIG_DENORMALS \ + VML_CONFIG_FP_CONTRACT VML_CONFIG_INF VML_CONFIG_NAN VML_CONFIG_NEON \ + VML_CONFIG_SSE2 VML_CONFIG_AVX VML_CONFIG_MIC VML_CONFIG_ALTIVEC \ + VML_CONFIG_VSX VML_CONFIG_QPX // Define "best" vector types namespace vecmathlib { - + #if defined VECMATHLIB_HAVE_VEC_FLOAT_16 -# define VECMATHLIB_MAX_FLOAT_VECSIZE 16 +#define VECMATHLIB_MAX_FLOAT_VECSIZE 16 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8 -# define VECMATHLIB_MAX_FLOAT_VECSIZE 8 +#define VECMATHLIB_MAX_FLOAT_VECSIZE 8 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4 -# define VECMATHLIB_MAX_FLOAT_VECSIZE 4 +#define VECMATHLIB_MAX_FLOAT_VECSIZE 4 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2 -# define VECMATHLIB_MAX_FLOAT_VECSIZE 2 +#define VECMATHLIB_MAX_FLOAT_VECSIZE 2 #elif defined VECMATHLIB_HAVE_VEC_FLOAT_1 -# define VECMATHLIB_MAX_FLOAT_VECSIZE 1 +#define VECMATHLIB_MAX_FLOAT_VECSIZE 1 #endif - + #if defined VECMATHLIB_HAVE_VEC_DOUBLE_8 -# define VECMATHLIB_MAX_DOUBLE_VECSIZE 8 +#define VECMATHLIB_MAX_DOUBLE_VECSIZE 8 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_4 -# define VECMATHLIB_MAX_DOUBLE_VECSIZE 4 +#define VECMATHLIB_MAX_DOUBLE_VECSIZE 4 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 -# define VECMATHLIB_MAX_DOUBLE_VECSIZE 2 +#define VECMATHLIB_MAX_DOUBLE_VECSIZE 2 #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_1 -# define VECMATHLIB_MAX_DOUBLE_VECSIZE 1 +#define VECMATHLIB_MAX_DOUBLE_VECSIZE 1 #endif - + #ifdef VECMATHLIB_MAX_FLOAT_VECSIZE - typedef realvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec; - typedef intvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec; - typedef boolvec<float,VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec; +typedef realvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> float32_vec; +typedef intvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> int32_vec; +typedef boolvec<float, VECMATHLIB_MAX_FLOAT_VECSIZE> bool32_vec; #else - typedef realpseudovec<float,1> float32_vec; - typedef intpseudovec<float,1> int32_vec; - typedef boolpseudovec<float,1> bool32_vec; +typedef realpseudovec<float, 1> float32_vec; +typedef intpseudovec<float, 1> int32_vec; +typedef boolpseudovec<float, 1> bool32_vec; #endif - + #ifdef VECMATHLIB_MAX_DOUBLE_VECSIZE - typedef realvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec; - typedef intvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec; - typedef boolvec<double,VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec; +typedef realvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> float64_vec; +typedef intvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> int64_vec; +typedef boolvec<double, VECMATHLIB_MAX_DOUBLE_VECSIZE> bool64_vec; #else - typedef realpseudovec<double,1> float64_vec; - typedef intpseudovec<double,1> int64_vec; - typedef boolpseudovec<double,1> bool64_vec; +typedef realpseudovec<double, 1> float64_vec; +typedef intpseudovec<double, 1> int64_vec; +typedef boolpseudovec<double, 1> bool64_vec; #endif } |