From ab9a95904b3982859cf3a54e13a9896e7c719f5f Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Mon, 17 Jun 2013 14:37:44 -0400 Subject: Rename implementation files to indicate vector size --- vec_altivec_float4.h | 553 +++++++++++++++++++++++++++++++++++++++ vec_avx_double4.h | 643 +++++++++++++++++++++++++++++++++++++++++++++ vec_avx_float8.h | 646 +++++++++++++++++++++++++++++++++++++++++++++ vec_avx_fp16_16.h | 582 +++++++++++++++++++++++++++++++++++++++++ vec_avx_fp8_32.h | 648 +++++++++++++++++++++++++++++++++++++++++++++ vec_double_avx.h | 643 --------------------------------------------- vec_double_qpx.h | 667 ----------------------------------------------- vec_double_sse2.h | 646 --------------------------------------------- vec_double_sse2_scalar.h | 528 ------------------------------------- vec_double_vsx.h | 656 ---------------------------------------------- vec_float_altivec.h | 553 --------------------------------------- vec_float_avx.h | 646 --------------------------------------------- vec_float_neon.h | 558 --------------------------------------- vec_float_sse2.h | 651 --------------------------------------------- vec_float_sse2_scalar.h | 523 ------------------------------------- vec_fp16_avx.h | 582 ----------------------------------------- vec_fp8_avx.h | 648 --------------------------------------------- vec_neon_float2.h | 558 +++++++++++++++++++++++++++++++++++++++ vec_qpx_double4.h | 667 +++++++++++++++++++++++++++++++++++++++++++++++ vec_sse_double1.h | 528 +++++++++++++++++++++++++++++++++++++ vec_sse_double2.h | 646 +++++++++++++++++++++++++++++++++++++++++++++ vec_sse_float1.h | 523 +++++++++++++++++++++++++++++++++++++ vec_sse_float4.h | 651 +++++++++++++++++++++++++++++++++++++++++++++ vec_vsx_double2.h | 656 ++++++++++++++++++++++++++++++++++++++++++++++ vecmathlib.h | 29 ++- 25 files changed, 7317 insertions(+), 7314 deletions(-) create mode 100644 vec_altivec_float4.h create mode 100644 vec_avx_double4.h create mode 100644 
vec_avx_float8.h create mode 100644 vec_avx_fp16_16.h create mode 100644 vec_avx_fp8_32.h delete mode 100644 vec_double_avx.h delete mode 100644 vec_double_qpx.h delete mode 100644 vec_double_sse2.h delete mode 100644 vec_double_sse2_scalar.h delete mode 100644 vec_double_vsx.h delete mode 100644 vec_float_altivec.h delete mode 100644 vec_float_avx.h delete mode 100644 vec_float_neon.h delete mode 100644 vec_float_sse2.h delete mode 100644 vec_float_sse2_scalar.h delete mode 100644 vec_fp16_avx.h delete mode 100644 vec_fp8_avx.h create mode 100644 vec_neon_float2.h create mode 100644 vec_qpx_double4.h create mode 100644 vec_sse_double1.h create mode 100644 vec_sse_double2.h create mode 100644 vec_sse_float1.h create mode 100644 vec_sse_float4.h create mode 100644 vec_vsx_double2.h diff --git a/vec_altivec_float4.h b/vec_altivec_float4.h new file mode 100644 index 0000000..813141e --- /dev/null +++ b/vec_altivec_float4.h @@ -0,0 +1,553 @@ +// -*-C++-*- + +#ifndef VEC_ALTIVEC_FLOAT4_H +#define VEC_ALTIVEC_FLOAT4_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// Altivec intrinsics +#include +#undef vector +#undef pixel +#undef bool + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FLOAT_4 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 4; + typedef bool scalar_t; + typedef __vector __bool int bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values are -1, false values are 0 + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef 
realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(vec_splats(from_bool(a))) {} + boolvec(bool const* as) + { + for (int d=0; d + struct intvec: floatprops + { + static int const size = 4; + typedef int_t scalar_t; + typedef __vector int ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(vec_splats(a)) {} + intvec(int_t const* as) + { + for (int d=0; d>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<>(intvec n) const + { + return vec_sra(v, (__vector unsigned int)n.v); + } + intvec operator<<(intvec n) const + { + return vec_sl(v, (__vector unsigned int)n.v); + } + intvec& operator>>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< x); } + boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); } + boolvec_t 
operator>=(intvec const& x) const { return !(*this < x); } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 4; + typedef real_t scalar_t; + typedef __vector float vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+v" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(vec_splats(a)) {} + realvec(real_t const* as) + { + for (int d=0; d mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_ld(0, p); + } + static realvec_t loadu(real_t const* p) + { + realvec_t v0 = vec_ld(0, p); + realvec_t v1 = vec_ld(15, p); + return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t 
loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + vec_st(v, 0, p); + } + void storeu(real_t* p) const + { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + p[2] = (*this)[2]; + p[3] = (*this)[3]; + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // Use vec_ste? + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // Use vec_ste? 
+ if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return (__vector int) v; } + intvec_t convert_int() const { return vec_cts(v, 0); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { +#if defined __VSX__ + return vec_mul(v, x.v); +#else + return vec_madd(v, x.v, RV(0.0).v); +#endif + } + realvec operator/(realvec x) const { +#if defined __VSX__ + return vec_div(v, x.v); +#else + return *this * x.rcp(); +#endif + } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const + { + return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + } + + + + boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec const& x) const { return ! 
(*this == x); } + boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } + boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return vec_max(v, y.v); } + realvec fmin(realvec y) const { return vec_min(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() 
const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const + { + realvec x = *this; + realvec r = vec_re(v); // this is only an approximation + // TODO: use fma + // Note: don't rewrite this expression, this may introduce + // cancellation errors + r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp) + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return vec_round(v); } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const + { +#if defined __VSX__ + return vec_rsqrt(v); +#else + realvec x = *this; + realvec r = vec_rsqrte(x.v); // this is only an approximation + // TODO: use fma + // one Newton iteration (see vml_rsqrt) + r += RV(0.5)*r * (RV(1.0) - x * r*r); + return r; +#endif + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { +#if defined __VSX__ + return vec_sqrt(v); +#else + return *this * rsqrt(); +#endif + } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return vec_trunc(v); } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return (__vector int) v; + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return -(__vector int)v; + } + + inline + auto 
boolvec::operator==(boolvec x) const -> boolvec_t + { + return as_int() == x.as_int(); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return vec_sel(y.v, x.v, v); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return vec_sel(y.v, x.v, v); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return (__vector float)v; + } + + inline auto intvec::convert_float() const -> realvec_t + { + return vec_ctf(v, 0); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_ALTIVEC_FLOAT4_H diff --git a/vec_avx_double4.h b/vec_avx_double4.h new file mode 100644 index 0000000..15f7edf --- /dev/null +++ b/vec_avx_double4.h @@ -0,0 +1,643 @@ +// -*-C++-*- + +#ifndef VEC_AVX_DOUBLE4_H +#define VEC_AVX_DOUBLE4_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// AVX intrinsics +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_DOUBLE_4 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 4; + typedef bool scalar_t; + typedef __m256d bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // 
be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): + v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {} + boolvec(bool const* as): + v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]), + from_bool(as[2]), + from_bool(as[1]), + from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); } + + boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); } + boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); } + + bool all() const + { + // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; + return ! (! *this).any(); + } + bool any() const + { + // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; + return ! 
_mm256_testz_pd(v, v); + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 4; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi64x(a)) {} + intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {} + static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return _mm256_castsi256_pd(v); } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. 
+ intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi64(vlo, xvlo); + vhi = _mm_add_epi64(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi64(vlo, xvlo); + vhi = _mm_sub_epi64(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v), + _mm256_castsi256_pd(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v), + _mm256_castsi256_pd(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v), + _mm256_castsi256_pd(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + 
intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi64(vlo, n); + vhi = _mm_srli_epi64(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + // There is no _mm_srai_epi64. To emulate it, add 0x80000000 + // before shifting, and subtract the shifted 0x80000000 after + // shifting +#if 0 + __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0), + _mm_srli_epi64(vlo, 63)); + __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0), + _mm_srli_epi64(vhi, 63)); + vlo = _mm_xor_si128(signmask01, vlo); + vhi = _mm_xor_si128(signmask23, vhi); + vlo = _mm_srli_epi64(vlo, n); + vhi = _mm_srli_epi64(vhi, n); + vlo = _mm_xor_si128(signmask01, vlo); + vhi = _mm_xor_si128(signmask23, vhi); +#else + // Convert signed to unsiged + vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1))); + vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1))); + // Shift + vlo = _mm_srli_epi64(vlo, n); + vhi = _mm_srli_epi64(vhi, n); + // Undo conversion + vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n))); + vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n))); +#endif + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi64(vlo, n); + vhi = _mm_slli_epi64(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec 
operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! (*this < x); + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 4; + typedef real_t scalar_t; + typedef __m256d vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_pd(a)) {} + realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_pd(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_pd(p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + 
} + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_pd(p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_pd(p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_pd(p, m.m.as_int(), v); + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return _mm256_castpd_si256(v); } + intvec_t convert_int() const { return MF::vml_convert_int(*this); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); } + realvec operator-(realvec x) 
const { return _mm256_sub_pd(v, x.v); } + realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); } + realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const + { + // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + // __m256d x = _mm256_hadd_pd(v, v); + // __m128d xlo = _mm256_extractf128_pd(x, 0); + // __m128d xhi = _mm256_extractf128_pd(x, 1); + realvec x = *this; + x = _mm256_hadd_pd(x.v, x.v); + return x[0] + x[2]; + } + + + + boolvec_t operator==(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ); + } + boolvec_t operator!=(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_NEQ_OQ); + } + boolvec_t operator<(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ); + } + boolvec_t operator<=(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ); + } + boolvec_t operator>(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ); + } + boolvec_t operator>=(realvec const& x) const + { + return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ); + } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return _mm256_ceil_pd(v); } + realvec copysign(realvec y) const { 
return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return MF::vml_fabs(*this); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return _mm256_floor_pd(v); } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const { return _mm256_max_pd(v, y.v); } + realvec fmin(realvec y) const { return _mm256_min_pd(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const + { + return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT); + } + realvec round() const { return 
MF::vml_round(*this); } + realvec rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return v; } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return _mm256_sqrt_pd(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return _mm256_castpd_si256(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + //return ifthen(v, U(1), U(0)); + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return _mm256_blendv_pd(y.v, x.v, v); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return _mm256_castsi256_pd(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + return MF::vml_convert_float(*this); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_AVX_DOUBLE4_H diff --git a/vec_avx_float8.h b/vec_avx_float8.h new file mode 100644 index 0000000..e7e1187 --- /dev/null +++ b/vec_avx_float8.h @@ -0,0 +1,646 @@ +// -*-C++-*- + +#ifndef VEC_AVX_FLOAT8_H +#define VEC_AVX_FLOAT8_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// AVX intrinsics +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FLOAT_8 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 8; + typedef bool scalar_t; + typedef __m256 bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * 
sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): + v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {} + boolvec(bool const* as): + v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]), + from_bool(as[6]), + from_bool(as[5]), + from_bool(as[4]), + from_bool(as[3]), + from_bool(as[2]), + from_bool(as[1]), + from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return _mm256_xor_ps(boolvec(true), v); } + + boolvec operator&&(boolvec x) const { return _mm256_and_ps(v, x.v); } + boolvec operator||(boolvec x) const { return _mm256_or_ps(v, x.v); } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const { return _mm256_xor_ps(v, x.v); } + + bool all() const + { + // return + // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] && + // (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7]; 
+ return ! (! *this).any(); + } + bool any() const + { + // return + // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] || + // (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7]; + return ! _mm256_testz_ps(v, v); + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 8; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi32(a)) {} + intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4], + as[3], as[2], as[1], as[0])) {} + static intvec iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return _mm256_castsi256_ps(v); } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. 
+ intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(0) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi32(vlo, xvlo); + vhi = _mm_add_epi32(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi32(vlo, xvlo); + vhi = _mm_sub_epi32(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + 
intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi32(vlo, n); + vhi = _mm_srli_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srai_epi32(vlo, n); + vhi = _mm_srai_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi32(vlo, n); + vhi = _mm_slli_epi32(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! 
(*this < x); + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 8; + typedef real_t scalar_t; + typedef __m256 vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_ps(a)) {} + realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4], + as[3], as[2], as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_ps(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_ps(p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, 
mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_ps(p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_ps(p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + _mm256_maskstore_ps(p, m.m.as_int(), v); + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n=0; n(realvec const& x) const + { + return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ); + } + boolvec_t operator>=(realvec const& x) const + { + return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ); + } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return _mm256_ceil_ps(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { 
return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return MF::vml_fabs(*this); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return _mm256_floor_ps(v); } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const { return _mm256_max_ps(v, y.v); } + realvec fmin(realvec y) const { return _mm256_min_ps(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const + { + realvec x = *this; + realvec r = _mm256_rcp_ps(x); // this is only an approximation + r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const + { + return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT); + } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const + { + realvec x = *this; 
+ realvec r = _mm256_rsqrt_ps(x); // this is only an approximation + r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) + return r; + } + boolvec_t signbit() const { return v; } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return _mm256_sqrt_ps(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return _mm256_castps_si256(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return _mm256_blendv_ps(y.v, x.v, v); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return _mm256_castsi256_ps(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + return _mm256_cvtepi32_ps(v); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_AVX_FLOAT8_H diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h new file mode 100644 index 0000000..7cb98b2 --- /dev/null +++ b/vec_avx_fp16_16.h @@ -0,0 +1,582 @@ +// -*-C++-*- + +#ifndef VEC_AVX_FP16_16_H +#define VEC_AVX_FP16_16_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// AVX intrinsics +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FP16_16 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 16; + typedef bool scalar_t; + typedef __m256i bvector_t; + static int const alignment = 
sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {} + boolvec(bool const* as): + v(_mm256_set_epi16(from_bool(as[15]), + from_bool(as[14]), + from_bool(as[13]), + from_bool(as[12]), + from_bool(as[11]), + from_bool(as[10]), + from_bool(as[ 9]), + from_bool(as[ 8]), + from_bool(as[ 7]), + from_bool(as[ 6]), + from_bool(as[ 5]), + from_bool(as[ 4]), + from_bool(as[ 3]), + from_bool(as[ 2]), + from_bool(as[ 1]), + from_bool(as[ 0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + 
_mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + bool all() const + { + bool r = true; + for (int n=0; n + struct intvec: floatprops + { + static int const size = 16; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi16(a)) {} + intvec(int_t const* as): + v(_mm256_set_epi16(as[15], + as[14], + as[13], + as[12], + as[11], + as[10], + as[ 9], + as[ 8], + as[ 7], + as[ 6], + as[ 5], + as[ 4], + as[ 3], + as[ 2], + as[ 1], + as[ 0])) {} + static intvec iota() + { + return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. 
+ intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi16(vlo, xvlo); + vhi = _mm_add_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi16(vlo, xvlo); + vhi = _mm_sub_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + 
intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi16(vlo, n); + vhi = _mm_srli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srai_epi16(vlo, n); + vhi = _mm_srai_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi16(vlo, n); + vhi = _mm_slli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< + struct realvec: floatprops + { + static int const size = 16; + typedef real_t scalar_t; + typedef __m256i vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; 
if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {} + realvec(real_t const* as): + v(_mm256_set_epi16(FP::as_int(as[15]), + FP::as_int(as[14]), + FP::as_int(as[13]), + FP::as_int(as[12]), + FP::as_int(as[11]), + FP::as_int(as[10]), + FP::as_int(as[ 9]), + FP::as_int(as[ 8]), + FP::as_int(as[ 7]), + FP::as_int(as[ 6]), + FP::as_int(as[ 5]), + FP::as_int(as[ 4]), + FP::as_int(as[ 3]), + FP::as_int(as[ 2]), + FP::as_int(as[ 1]), + FP::as_int(as[ 0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % 
alignment == 0); + _mm256_store_si256((__m256i*)p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_si256((__m256i*)p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n=0; n(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } + + + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return v; + } + + inline auto intvec::convert_float() const -> realvec_t + { + __builtin_unreachable(); + 
} + +} // namespace vecmathlib + +#endif // #ifndef VEC_AVX_FP16_16_H diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h new file mode 100644 index 0000000..a6e33a1 --- /dev/null +++ b/vec_avx_fp8_32.h @@ -0,0 +1,648 @@ +// -*-C++-*- + +#ifndef VEC_AVX_FP8_32_H +#define VEC_AVX_FP8_32_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// AVX intrinsics +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FP8_32 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 32; + typedef bool scalar_t; + typedef __m256i bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {} + boolvec(bool const* as): + v(_mm256_set_epi8(from_bool(as[31]), + from_bool(as[30]), + from_bool(as[29]), + from_bool(as[28]), + from_bool(as[27]), + from_bool(as[26]), + from_bool(as[25]), + from_bool(as[24]), + from_bool(as[23]), + from_bool(as[22]), + from_bool(as[21]), + from_bool(as[20]), + from_bool(as[19]), + 
from_bool(as[18]), + from_bool(as[17]), + from_bool(as[16]), + from_bool(as[15]), + from_bool(as[14]), + from_bool(as[13]), + from_bool(as[12]), + from_bool(as[11]), + from_bool(as[10]), + from_bool(as[ 9]), + from_bool(as[ 8]), + from_bool(as[ 7]), + from_bool(as[ 6]), + from_bool(as[ 5]), + from_bool(as[ 4]), + from_bool(as[ 3]), + from_bool(as[ 2]), + from_bool(as[ 1]), + from_bool(as[ 0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + bool all() const + { + bool r = true; + for (int n=0; n + struct intvec: floatprops + { + static int const size = 32; + typedef int_t scalar_t; + typedef __m256i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects 
won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi8(a)) {} + intvec(int_t const* as): + v(_mm256_set_epi8(as[31], + as[30], + as[29], + as[28], + as[27], + as[26], + as[25], + as[24], + as[23], + as[22], + as[21], + as[20], + as[19], + as[18], + as[17], + as[16], + as[15], + as[14], + as[13], + as[12], + as[11], + as[10], + as[ 9], + as[ 8], + as[ 7], + as[ 6], + as[ 5], + as[ 4], + as[ 3], + as[ 2], + as[ 1], + as[ 0])) {} + static intvec iota() + { + return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! 
+ + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi8(vlo, xvlo); + vhi = _mm_add_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi8(vlo, xvlo); + vhi = _mm_sub_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU) >> U(n); + uint_t maskhi = U(0xff00U); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), 
mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U); + __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8), + _mm_set1_epi16(masklo)); + __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n), + _mm_set1_epi16(maskhi)); + vlo = _mm_or_si128(vlolo, vlohi); + __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8), + _mm_set1_epi16(masklo)); + __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n), + _mm_set1_epi16(maskhi)); + vhi = _mm_or_si128(vhilo, vhihi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U) << U(n); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< + struct realvec: floatprops + { + static int const size = 32; + typedef real_t scalar_t; + typedef __m256i vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; 
+ typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {} + realvec(real_t const* as): + v(_mm256_set_epi8(FP::as_int(as[31]), + FP::as_int(as[30]), + FP::as_int(as[29]), + FP::as_int(as[28]), + FP::as_int(as[27]), + FP::as_int(as[26]), + FP::as_int(as[25]), + FP::as_int(as[24]), + FP::as_int(as[23]), + FP::as_int(as[22]), + FP::as_int(as[21]), + FP::as_int(as[20]), + FP::as_int(as[19]), + FP::as_int(as[18]), + FP::as_int(as[17]), + FP::as_int(as[16]), + FP::as_int(as[15]), + FP::as_int(as[14]), + FP::as_int(as[13]), + FP::as_int(as[12]), + FP::as_int(as[11]), + FP::as_int(as[10]), + FP::as_int(as[ 9]), + FP::as_int(as[ 8]), + FP::as_int(as[ 7]), + FP::as_int(as[ 6]), + FP::as_int(as[ 5]), + FP::as_int(as[ 4]), + FP::as_int(as[ 3]), + FP::as_int(as[ 2]), + FP::as_int(as[ 1]), + FP::as_int(as[ 0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm256_load_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + 
return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm256_store_si256((__m256i*)p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_si256((__m256i*)p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n=0; n(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } + + + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } + }; + + + 
+ // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return v; + } + + inline auto intvec::convert_float() const -> realvec_t + { + __builtin_unreachable(); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_AVX_FP8_32_H diff --git a/vec_double_avx.h b/vec_double_avx.h deleted file mode 100644 index cf5e97c..0000000 --- a/vec_double_avx.h +++ /dev/null @@ -1,643 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_DOUBLE_AVX_H -#define VEC_DOUBLE_AVX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// AVX intrinsics -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_DOUBLE_4 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 4; - typedef bool scalar_t; - typedef __m256d bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef 
boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm256_castsi256_pd(_mm256_set1_epi64x(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm256_castsi256_pd(_mm256_set_epi64x(from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return _mm256_xor_pd(boolvec(true), v); } - - boolvec operator&&(boolvec x) const { return _mm256_and_pd(v, x.v); } - boolvec operator||(boolvec x) const { return _mm256_or_pd(v, x.v); } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const { return _mm256_xor_pd(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; - return ! (! *this).any(); - } - bool any() const - { - // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; - return ! 
_mm256_testz_pd(v, v); - } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 4; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi64x(a)) {} - intvec(int_t const* as): v(_mm256_set_epi64x(as[3], as[2], as[1], as[0])) {} - static intvec iota() { return _mm256_set_epi64x(3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return _mm256_castsi256_pd(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check - // whether x is positive and x-1 is negative. 
- intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi64(vlo, xvlo); - vhi = _mm_add_epi64(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator-(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi64(vlo, xvlo); - vhi = _mm_sub_epi64(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); - } - intvec operator|(intvec x) const - { - return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); - } - intvec operator^(intvec x) const - { - return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v), - _mm256_castsi256_pd(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - 
intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi64(vlo, n); - vhi = _mm_srli_epi64(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator>>(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - // There is no _mm_srai_epi64. To emulate it, add 0x80000000 - // before shifting, and subtract the shifted 0x80000000 after - // shifting -#if 0 - __m128i signmask01 = _mm_sub_epi64(_mm_set1_epi64x(0), - _mm_srli_epi64(vlo, 63)); - __m128i signmask23 = _mm_sub_epi64(_mm_set1_epi64x(0), - _mm_srli_epi64(vhi, 63)); - vlo = _mm_xor_si128(signmask01, vlo); - vhi = _mm_xor_si128(signmask23, vhi); - vlo = _mm_srli_epi64(vlo, n); - vhi = _mm_srli_epi64(vhi, n); - vlo = _mm_xor_si128(signmask01, vlo); - vhi = _mm_xor_si128(signmask23, vhi); -#else - // Convert signed to unsiged - vlo = _mm_add_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1))); - vhi = _mm_add_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1))); - // Shift - vlo = _mm_srli_epi64(vlo, n); - vhi = _mm_srli_epi64(vhi, n); - // Undo conversion - vlo = _mm_sub_epi64(vlo, _mm_set1_epi64x(U(1) << (bits-1-n))); - vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n))); -#endif - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator<<(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi64(vlo, n); - vhi = _mm_slli_epi64(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec 
operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< x); - } - boolvec_t operator>(intvec const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! (*this < x); - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 4; - typedef real_t scalar_t; - typedef __m256d vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_pd(a)) {} - realvec(real_t const* as): v(_mm256_set_pd(as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_pd(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_pd(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - 
} - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_pd(p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_pd(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - _mm256_maskstore_pd(p, m.m.as_int(), v); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return _mm256_castpd_si256(v); } - intvec_t convert_int() const { return MF::vml_convert_int(*this); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return _mm256_add_pd(v, x.v); } - realvec operator-(realvec x) 
const { return _mm256_sub_pd(v, x.v); } - realvec operator*(realvec x) const { return _mm256_mul_pd(v, x.v); } - realvec operator/(realvec x) const { return _mm256_div_pd(v, x.v); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - // return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - // __m256d x = _mm256_hadd_pd(v, v); - // __m128d xlo = _mm256_extractf128_pd(x, 0); - // __m128d xhi = _mm256_extractf128_pd(x, 1); - realvec x = *this; - x = _mm256_hadd_pd(x.v, x.v); - return x[0] + x[2]; - } - - - - boolvec_t operator==(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_EQ_OQ); - } - boolvec_t operator!=(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_NEQ_OQ); - } - boolvec_t operator<(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_LT_OQ); - } - boolvec_t operator<=(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_LE_OQ); - } - boolvec_t operator>(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_GT_OQ); - } - boolvec_t operator>=(realvec const& x) const - { - return _mm256_cmp_pd(v, x.v, _CMP_GE_OQ); - } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return _mm256_ceil_pd(v); } - realvec copysign(realvec y) const { 
return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return MF::vml_fabs(*this); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return _mm256_floor_pd(v); } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const { return _mm256_max_pd(v, y.v); } - realvec fmin(realvec y) const { return _mm256_min_pd(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return _mm256_cmp_pd(v, v, _CMP_UNORD_Q); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const { return _mm256_div_pd(_mm256_set1_pd(1.0), v); } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT); - } - realvec round() const { return 
MF::vml_round(*this); } - realvec rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return v; } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return _mm256_sqrt_pd(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return _mm256_round_pd(v, _MM_FROUND_TO_ZERO); } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return _mm256_castpd_si256(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - //return ifthen(v, U(1), U(0)); - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return _mm256_blendv_pd(y.v, x.v, v); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return _mm256_castsi256_pd(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - return MF::vml_convert_float(*this); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_DOUBLE_AVX_H diff --git a/vec_double_qpx.h b/vec_double_qpx.h deleted file mode 100644 index 8ffdf67..0000000 --- a/vec_double_qpx.h +++ /dev/null @@ -1,667 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_DOUBLE_QPX_H -#define VEC_DOUBLE_QPX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include -#warning "TODO" -#include - -// QPX intrinsics -#ifdef __clang__ -# include -#else -# include -#endif -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_DOUBLE_4 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 4; - typedef bool scalar_t; - typedef vector4double 
bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // canonical true is +1.0, canonical false is -1.0 - // >=0 is true, -0 is true, nan is false - static real_t from_bool(bool a) { return a ? +1.0 : -1.0; } - static bool to_bool(real_t a) { return a>=0.0; } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vec_splats(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d - struct intvec: floatprops - { - static int const size = 4; - typedef int_t scalar_t; - typedef vector4double ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats(FP::as_float(a))) {} - intvec(int_t const* as) - { - for (int d=0; d> U(n)); - 
return r; - } - intvec operator>>(int_t n) const - { - intvec r; - for (int d=0; d> n); - return r; - } - intvec operator<<(int_t n) const - { - intvec r; - for (int d=0; d>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[d])); - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int d=0; d> n[d]); - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int d=0; d>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const - { - boolvec_t r; - for (int d=0; d x[d]); - return r; - } - boolvec_t operator>=(intvec const& x) const - { - boolvec_t r; - for (int d=0; d= x[d]); - return r; - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 4; - typedef real_t scalar_t; - typedef vector4double vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+v" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(real_t const* as) - { - for (int d=0; d mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_lda(0, (real_t*)p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t v0 = vec_ld(0, (real_t*)p); 
- realvec_t v1 = vec_ld(31, (real_t*)p); - return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p)); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - // TODO: use load instruction with fixed offset - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - // TODO: use load instruction with fixed offset - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); -#warning "TODO" - std::cout << "yes this is storea\n"; - vec_sta(v, 0, p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - p[2] = (*this)[2]; - p[3] = (*this)[3]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } 
else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return v; } - intvec_t convert_int() const { return vec_ctid(v); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return vec_neg(v); } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { return vec_mul(v, x.v); } - realvec operator/(realvec x) const - { - // return vec_swdiv_nochk(v, x.v); - return div_fastd4(v, x.v); - } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - } - - - - boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return ! (*this == x); } - boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return ! (*this > x); } - boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return ! 
(*this < x); } - - - - realvec acos() const { return acosd4(v); } - realvec acosh() const { return acoshd4(v); } - realvec asin() const { return asind4(v); } - realvec asinh() const { return asinhd4(v); } - realvec atan() const { return atand4(v); } - realvec atan2(realvec y) const { return atan2d4(v, y.v); } - realvec atanh() const { return atanhd4(v); } - realvec cbrt() const { return cbrtd4(v); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return vec_cpsgn(v, y.v); } - realvec cos() const { return cosd4(v); } - realvec cosh() const { return coshd4(v); } - realvec exp() const { return expd4(v); } - realvec exp10() const { return exp10d4(v); } - realvec exp2() const { return exp2d4(v); } - realvec expm1() const { return expm1d4(v); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } - realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); } - realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return hypotd4(v, y.v); } - intvec_t ilogb() const - { - int_t ilogb_[] = { - ::ilogb((*this)[0]), - ::ilogb((*this)[1]), - ::ilogb((*this)[2]), - ::ilogb((*this)[3]) - }; - return intvec_t(ilogb_); - } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return vec_tstnan(v, v); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); } - realvec ldexp(intvec_t n) const - { - real_t ldexp_[] = { - std::ldexp((*this)[0], n[0]), - std::ldexp((*this)[1], n[1]), - std::ldexp((*this)[2], n[2]), - std::ldexp((*this)[3], n[3]) - }; - return realvec_t(ldexp_); - } - realvec 
log() const { return logd4(v); } - realvec log10() const { return log10d4(v); } - realvec log1p() const { return log1pd4(v); } - realvec log2() const { return log2d4(v); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return powd4(v, y.v); } - realvec rcp() const { return recip_fastd4(v); } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const { return MF::vml_rint(*this); } - realvec round() const { return vec_round(v); } - realvec rsqrt() const - { - realvec x = *this; - realvec r = vec_rsqrte(x.v); // this is only an approximation - // TODO: use fma - // one Newton iteration (see vml_rsqrt) - r += RV(0.5)*r * (RV(1.0) - x * r*r); - return r; - } - boolvec_t signbit() const { return !copysign(RV(1.0)).as_int().as_bool(); } - realvec sin() const { return sind4(v); } - realvec sinh() const { return sinhd4(v); } - realvec sqrt() const - { - // return vec_sqrtsw_nochk(v); - return *this * rsqrt(); - } - realvec tan() const { return tand4(v); } - realvec tanh() const { return tanhd4(v); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline - boolvec::intvec_t boolvec::as_int() const - { - return v; - } - - inline - boolvec::intvec_t boolvec::convert_int() const - { - return ifthen(IV(I(1)), IV(I(0))); - } - - inline - boolvec::intvec_t boolvec::ifthen(intvec_t x, - intvec_t y) const - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - boolvec::realvec_t boolvec::ifthen(realvec_t x, - realvec_t y) const - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline intvec::realvec_t intvec::as_float() const - { - return v; - } - - inline intvec::realvec_t intvec::convert_float() const - { - return vec_cfid(v); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_DOUBLE_QPX_H diff --git a/vec_double_sse2.h b/vec_double_sse2.h deleted file mode 100644 index 7c31d2d..0000000 
--- a/vec_double_sse2.h +++ /dev/null @@ -1,646 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_DOUBLE_SSE2_H -#define VEC_DOUBLE_SSE2_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// SSE2 intrinsics -#include -#ifdef __SSE3__ // Intel's SSE 3 -# include -#endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include -#endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include -#endif -#if defined __AVX__ // Intel's AVX -# include -#endif - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_DOUBLE_2 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 2; - typedef bool scalar_t; - typedef __m128d bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& 
set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return _mm_xor_pd(boolvec(true), v); } - - boolvec operator&&(boolvec x) const { return _mm_and_pd(v, x.v); } - boolvec operator||(boolvec x) const { return _mm_or_pd(v, x.v); } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const { return _mm_xor_pd(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1]; -#if defined __AVX__ - return ! (! *this).any(); -#else - boolvec x = *this; - x = x && _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1)); - return x[0]; -#endif - } - bool any() const - { - // return (*this)[0] || (*this)[1]; -#if defined __AVX__ - return ! _mm_testz_pd(v, v); -#else - boolvec x = *this; - x = x || _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1)); - return x[0]; -#endif - } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 2; - typedef int_t scalar_t; - typedef __m128i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, 
*this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm_set1_epi64x(a)) {} - intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {} - static intvec iota() { return _mm_set_epi64x(1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return _mm_castsi128_pd(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! 
- - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const { return _mm_add_epi64(v, x.v); } - intvec operator-(intvec x) const { return _mm_sub_epi64(v, x.v); } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - intvec operator|(intvec x) const - { - return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - intvec operator^(intvec x) const - { - return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v), - _mm_castsi128_pd(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const { return _mm_srli_epi64(v, n); } - intvec operator>>(int_t n) const - { - // There is no _mm_srai_epi64. 
To emulate it, add 0x80000000 - // before shifting, and subtract the shifted 0x80000000 after - // shifting - intvec x = *this; - // Convert signed to unsiged - x += U(1) << (bits-1); - // Shift - x = x.lsr(n); - // Undo conversion - x -= U(1) << (bits-1-n); - return x; - } - intvec operator<<(int_t n) const { return _mm_slli_epi64(v, n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< x); - } - boolvec_t operator>(intvec const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! (*this < x); - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 2; - typedef real_t scalar_t; - typedef __m128d vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm_set1_pd(a)) {} - realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {} - - operator vector_t() const { return v; } - 
real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm_load_pd(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm_loadu_pd(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm_store_pd(p, v); - } - void storeu(real_t* p) const - { - return _mm_storeu_pd(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { -#if defined __AVX__ - _mm_maskstore_pd(p, m.m.as_int(), v); -#else - if (m.m[0]) _mm_storel_pd(p , v); - else if (m.m[1]) _mm_storeh_pd(p+1, v); -#endif - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if 
(m.m[0]) _mm_storel_pd(p , v); - else if (m.m[1]) _mm_storeh_pd(p+1, v); - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return _mm_castpd_si128(v); } - intvec_t convert_int() const { return MF::vml_convert_int(*this); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return _mm_add_pd(v, x.v); } - realvec operator-(realvec x) const { return _mm_sub_pd(v, x.v); } - realvec operator*(realvec x) const { return _mm_mul_pd(v, x.v); } - realvec operator/(realvec x) const { return _mm_div_pd(v, x.v); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1]; - } - real_t sum() const - { -#ifdef __SSE3__ - return _mm_cvtsd_f64(_mm_hadd_pd(v, v)); -#else - return (*this)[0] + (*this)[1]; -#endif - } - - - - boolvec_t operator==(realvec const& x) const - { - return _mm_cmpeq_pd(v, x.v); - } - boolvec_t operator!=(realvec const& x) const - { - return _mm_cmpneq_pd(v, x.v); - } - boolvec_t operator<(realvec const& x) const - { - return _mm_cmplt_pd(v, x.v); - } - boolvec_t operator<=(realvec const& x) const - { - return _mm_cmple_pd(v, x.v); - } - boolvec_t operator>(realvec const& x) const - { - return _mm_cmpgt_pd(v, x.v); - } - boolvec_t operator>=(realvec const& x) const - { - return _mm_cmpge_pd(v, x.v); - } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return 
MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { -#ifdef __SSE4_1__ - return _mm_ceil_pd(v); -#else - return MF::vml_ceil(*this); -#endif - } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return MF::vml_fabs(*this); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { -#ifdef __SSE4_1__ - return _mm_floor_pd(v); -#else - return MF::vml_floor(*this); -#endif - } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const { return _mm_max_pd(v, y.v); } - realvec fmin(realvec y) const { return _mm_min_pd(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return _mm_cmpunord_pd(v, v);; } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() 
const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { -#ifdef __SSE4_1__ - return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT); -#else - return MF::vml_rint(*this); -#endif - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return v; } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return _mm_sqrt_pd(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { -#ifdef __SSE4_1__ - return _mm_round_pd(v, _MM_FROUND_TO_ZERO); -#else - return MF::vml_trunc(*this); -#endif - } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return _mm_castpd_si128(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - //return ifthen(v, U(1), U(0)); - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { -#ifdef __SSE4_1__ - return _mm_blendv_pd(y.v, x.v, v); -#else - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); -#endif - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return _mm_castsi128_pd(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - return MF::vml_convert_float(*this); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_DOUBLE_SSE2_H diff --git 
a/vec_double_sse2_scalar.h b/vec_double_sse2_scalar.h deleted file mode 100644 index 4c3b4b6..0000000 --- a/vec_double_sse2_scalar.h +++ /dev/null @@ -1,528 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_DOUBLE_SSE2_SCALAR_H -#define VEC_DOUBLE_SSE2_SCALAR_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// SSE2 intrinsics -#include -#ifdef __SSE3__ // Intel's SSE 3 -# include -#endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include -#endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include -#endif -#if defined __AVX__ // Intel's AVX -# include -#endif - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_DOUBLE_1 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 1; - typedef bool scalar_t; - typedef uint_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - // true values are non-zero, false values are zero - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(a) {} - // TODO: remove this - boolvec(int x): v(x) {} - boolvec(bool const* as): v(as[0]) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return v; } - boolvec& set_elt(int n, bool a) { return v=a, *this; } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // 
defined after intvec - - - - boolvec operator!() const { return !v; } - - boolvec operator&&(boolvec x) const { return v && x.v; } - boolvec operator||(boolvec x) const { return v || x.v; } - boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); } - boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); } - - bool all() const { return v; } - bool any() const { return v; } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 1; - typedef int_t scalar_t; - typedef int_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(int_t a): v(a) {} - intvec(int_t const* as): v(as[0]) {} - static intvec iota() { return intvec(I(0)); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return v; } - intvec& set_elt(int n, int_t a) { return v=a, *this; } - - - - boolvec_t as_bool() const { return U(v); } - boolvec_t convert_bool() const { return bool(v); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return +v; } - intvec operator-() const { return -v; } - - 
intvec operator+(intvec x) const { return v+x.v; } - intvec operator-(intvec x) const { return v-x.v; } - intvec operator*(intvec x) const { return v*x.v; } - intvec operator/(intvec x) const { return v/x.v; } - intvec operator%(intvec x) const { return v%x.v; } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - intvec& operator*=(intvec const& x) { return *this=*this*x; } - intvec& operator/=(intvec const& x) { return *this=*this/x; } - intvec& operator%=(intvec const& x) { return *this=*this%x; } - - - - intvec operator~() const { return ~v; } - - intvec operator&(intvec x) const { return v&x.v; } - intvec operator|(intvec x) const { return v|x.v; } - intvec operator^(intvec x) const { return v^x.v; } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const { return U(v) >> U(n); } - intvec operator>>(int_t n) const { return v>>n; } - intvec operator<<(int_t n) const { return v<>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n); } - intvec operator>>(intvec n) const { return v>>n; } - intvec operator<<(intvec n) const { return v<>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return v>x.v; } - boolvec_t operator>=(intvec const& x) const { return v>=x.v; } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 1; - typedef real_t scalar_t; - typedef double vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - private: - static __m128d from_double(double a) { return _mm_set_sd(a); } - static 
double to_double(__m128d a) { return _mm_cvtsd_f64(a); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(real_t a): v(a) {} - realvec(real_t const* as): v(as[0]) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return v; } - realvec& set_elt(int n, real_t a) { return v=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return *p; - } - static realvec_t loadu(real_t const* p) - { - return *p; - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - *p = v; - } - void storeu(real_t* p) const - { - *p = v; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff); - } - void storea(real_t* p, mask_t 
const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff, m); - } - - - - intvec_t as_int() const { return floatprops::as_int(v); } - intvec_t convert_int() const { -#ifdef __x86_64__ - return _mm_cvttsd_si64(_mm_set_sd(v)); -#else - return floatprops::convert_int(v); -#endif - } - - - - realvec operator+() const { return +v; } - realvec operator-() const { return -v; } - - realvec operator+(realvec x) const { return v+x.v; } - realvec operator-(realvec x) const { return v-x.v; } - realvec operator*(realvec x) const { return v*x.v; } - realvec operator/(realvec x) const { return v/x.v; } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const { return v; } - real_t sum() const { return v; } - - - - boolvec_t operator==(realvec const& x) const { return v==x.v; } - boolvec_t operator!=(realvec const& x) const { return v!=x.v; } - boolvec_t operator<(realvec const& x) const { return v(realvec const& x) const { return v>x.v; } - boolvec_t operator>=(realvec const& x) const { return v>=x.v; } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return 
MF::vml_cbrt(*this); } - realvec ceil() const - { -#ifdef __SSE4_1__ - return to_double(_mm_ceil_sd(from_double(v), from_double(v))); -#else - return std::ceil(v); -#endif - } - realvec copysign(realvec y) const { return std::copysign(v, y.v); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return std::fabs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { -#ifdef __SSE4_1__ - return to_double(_mm_floor_sd(from_double(v), from_double(v))); -#else - return std::floor(v); -#endif - } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const - { - return to_double(_mm_max_sd(from_double(v), from_double(y.v))); - } - realvec fmin(realvec y) const - { - return to_double(_mm_min_sd(from_double(v), from_double(y.v))); - } - realvec fmod(realvec y) const { return std::fmod(v, y.v); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const - { - int_t r = std::ilogb(v); - if (r == FP_ILOGB0) r = numeric_limits::min(); - else if (r == FP_ILOGBNAN) r = numeric_limits::max(); - return r; - } - boolvec_t isfinite() const { return std::isfinite(v); } - boolvec_t isinf() const { return std::isinf(v); } - boolvec_t isnan() const - { - return _mm_ucomineq_sd(from_double(v), from_double(v)); - } - boolvec_t isnormal() const { return std::isnormal(v); } - realvec ldexp(int_t n) const { return std::ldexp(v, n); } - realvec ldexp(intvec_t n) const { return std::ldexp(v, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - 
realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const { return R(1.0)/v; } - realvec remainder(realvec y) const { return std::remainder(v, y.v); } - realvec rint() const - { -#ifdef __SSE4_1__ - return to_double(_mm_round_sd(from_double(v), from_double(v), - _MM_FROUND_TO_NEAREST_INT)); -#else - return MF::vml_rint(*this); -#endif - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return std::signbit(v); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const - { - return to_double(_mm_sqrt_sd(from_double(v), from_double(v))); - } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { -#ifdef __SSE4_1__ - return to_double(_mm_round_sd(from_double(v), from_double(v), - _MM_FROUND_TO_ZERO)); -#else - return MF::vml_trunc(*this); -#endif - } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return I(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return v; - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return v ? x : y; - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return v ? 
x : y; - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return FP::as_float(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { -#ifdef __x86_64__ - return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v)); -#else - return FP::convert_float(v); -#endif - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_DOUBLE_SSE2_SCALAR_H diff --git a/vec_double_vsx.h b/vec_double_vsx.h deleted file mode 100644 index 67b4b36..0000000 --- a/vec_double_vsx.h +++ /dev/null @@ -1,656 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_DOUBLE_VSX_H -#define VEC_DOUBLE_VSX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// VSX intrinsics -#include -#undef vector -#undef pixel -#undef bool - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_DOUBLE_2 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 2; - typedef bool scalar_t; - typedef __vector __bool long long bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - // truth values are interpreted bit-wise - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t 
x): v(x) {} - boolvec(bool a): v(vec_splats(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d - struct intvec: floatprops - { - static int const size = 2; - typedef int_t scalar_t; - typedef __vector long long ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats(a)) {} - intvec(int_t const* as) - { - for (int d=0; d 1436 - // exchange pairs - static __vector unsigned char perm_int_swap() - { - return - (__vector unsigned char) - {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27}; - } - // 0123 4567 -> 0426 - // broadcast high elements of pairs - static __vector unsigned char perm_int_bchi() - { - return - (__vector unsigned char) - {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27}; - } - public: - - - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { - // return vec_add(v, x.v); - __vector unsigned int a = (__vector unsigned int)v; - __vector unsigned int b = (__vector unsigned int)x.v; - __vector unsigned int s = vec_add(a, b); - __vector unsigned int c = vec_addc(a, b); - __vector unsigned int z = vec_xor(z, z); - c = vec_perm(c, z, perm_int_swap()); - s = vec_add(s, c); - return (__vector long long)s; - } - intvec operator-(intvec x) const - { - // return vec_sub(v, x.v); - 
__vector unsigned int a = (__vector unsigned int)v; - __vector unsigned int b = (__vector unsigned int)x.v; - __vector unsigned int d = vec_sub(a, b); - __vector unsigned int c = vec_subc(a, b); - c = vec_sub(vec_splats(1U), c); - __vector unsigned int z = vec_xor(z, z); - c = vec_perm(c, z, perm_int_swap()); - d = vec_sub(d, c); - return (__vector long long)d; - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const - { - return (__vector long long)vec_nor((__vector int)v, (__vector int)v); - } - - intvec operator&(intvec x) const - { - return (__vector long long)vec_and((__vector int)v, (__vector int)x.v); - } - intvec operator|(intvec x) const - { - return (__vector long long)vec_or ((__vector int)v, (__vector int)x.v); - } - intvec operator^(intvec x) const - { - return (__vector long long)vec_xor((__vector int)v, (__vector int)x.v); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const { return lsr(IV(n)); } - intvec operator>>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - // return vec_sra(v, (__vector unsigned long long)n.v); - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - // return vec_sl(v, (__vector unsigned long long)n.v); - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<> (bits-1)).as_bool(); - } - - boolvec_t operator==(intvec const& x) const - { - // return vec_cmpeq(v, x.v); - __vector int a = (__vector 
int)v; - __vector int b = (__vector int)x.v; - __vector __bool int c = vec_cmpeq(a, b); - __vector __bool int cx = vec_perm(c, c, perm_int_swap()); - __vector __bool int r = vec_and(c, cx); - return (__vector __bool long long)r; - } - boolvec_t operator!=(intvec const& x) const { return !(*this == x); } - boolvec_t operator<(intvec const& x) const - { - __vector int a = (__vector int)v; - __vector int b = (__vector int)x.v; - __vector __bool int lt = vec_cmplt(a, b); - __vector __bool int eq = vec_cmpeq(a, b); - __vector unsigned int ua = (__vector unsigned int)v; - __vector unsigned int ub = (__vector unsigned int)x.v; - __vector __bool int ult = vec_cmplt(ua, ub); - __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap()); - __vector __bool int r = vec_or(lt, vec_and(eq, ultx)); - r = vec_perm(r, r, perm_int_bchi()); - return (__vector __bool long long)r; - } - boolvec_t operator<=(intvec const& x) const - { - return ! (*this > x); - } - boolvec_t operator>(intvec const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! 
(*this < x); - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 2; - typedef real_t scalar_t; - typedef __vector double vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+v" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(real_t const* as) - { - for (int d=0; d mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_ld(0, (const __vector double*)p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t v0 = vec_ld(0, (const __vector double*)p); - realvec_t v1 = vec_ld(15, (const __vector double*)p); - return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), 
*this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vec_st(v, 0, (__vector double*)p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // Use vec_ste? 
- if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return (__vector long long) v; } - intvec_t convert_int() const { return MF::vml_convert_int(*this); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { return vec_mul(v, x.v); } - realvec operator/(realvec x) const { return vec_div(v, x.v); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - } - - - - boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return ! 
(*this == x); } - boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } - realvec fmax(realvec y) const { return vec_max(v, y.v); } - realvec fmin(realvec y) const { return vec_min(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() 
const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = vec_re(v); // this is only an approximation - // TODO: use fma - // Note: don't rewrite this expression, this may introduce - // cancellation errors - r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp) - r += r * (RV(1.0) - x*r); - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const { return vec_rint(v); } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const - { - // realvec x = *this; - // realvec r = vec_rsqrte(x.v); // this is only an approximation - // // TODO: use fma - // // one Newton iteration (see vml_rsqrt) - // r += RV(0.5)*r * (RV(1.0) - x * r*r); - // return r; - return vec_rsqrt(v); - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - // realvec sqrt() const { return *this * rsqrt(); } - realvec sqrt() const { return vec_sqrt(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return (__vector long long) v; - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return -(__vector long long)v; - } - - inline - auto 
boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return vec_sel(y.v, x.v, v); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return (__vector double)v; - } - - inline auto intvec::convert_float() const -> realvec_t - { - // return vec_ctd(v, 0); - return MF::vml_convert_float(*this); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_DOUBLE_VSX_H diff --git a/vec_float_altivec.h b/vec_float_altivec.h deleted file mode 100644 index 1fdcbb4..0000000 --- a/vec_float_altivec.h +++ /dev/null @@ -1,553 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FLOAT_ALTIVEC_H -#define VEC_FLOAT_ALTIVEC_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// Altivec intrinsics -#include -#undef vector -#undef pixel -#undef bool - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FLOAT_4 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 4; - typedef bool scalar_t; - typedef __vector __bool int bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec 
const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vec_splats(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d - struct intvec: floatprops - { - static int const size = 4; - typedef int_t scalar_t; - typedef __vector int ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vec_splats(a)) {} - intvec(int_t const* as) - { - for (int d=0; d>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<>(intvec n) const - { - return vec_sra(v, (__vector unsigned int)n.v); - } - intvec operator<<(intvec n) const - { - return vec_sl(v, (__vector unsigned int)n.v); - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< x); } - boolvec_t operator>(intvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(intvec const& x) const { return !(*this < x); } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 4; - typedef real_t scalar_t; - typedef __vector float vector_t; - static int const alignment = sizeof(vector_t); - - static char 
const* name() { return ""; } - void barrier() { __asm__("": "+v" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vec_splats(a)) {} - realvec(real_t const* as) - { - for (int d=0; d mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vec_ld(0, p); - } - static realvec_t loadu(real_t const* p) - { - realvec_t v0 = vec_ld(0, p); - realvec_t v1 = vec_ld(15, p); - return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment 
== 0); - vec_st(v, 0, p); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // TODO: see for good ideas - p[0] = (*this)[0]; - p[1] = (*this)[1]; - p[2] = (*this)[2]; - p[3] = (*this)[3]; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // Use vec_ste? - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return (__vector int) v; } - intvec_t convert_int() const { return vec_cts(v, 0); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return vec_add(v, x.v); } - realvec operator-(realvec x) const { return vec_sub(v, x.v); } - realvec operator*(realvec x) const { -#if defined __VSX__ - return vec_mul(v, x.v); -#else - return vec_madd(v, x.v, RV(0.0).v); -#endif - } - realvec operator/(realvec x) const { -#if defined __VSX__ - return vec_div(v, x.v); -#else - return *this * x.rcp(); -#endif - } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return 
*this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { - return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; - } - - - - boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return ! (*this == x); } - boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return vec_ceil(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vec_abs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return vec_floor(v); } - realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } - realvec fmax(realvec y) const { return 
vec_max(v, y.v); } - realvec fmin(realvec y) const { return vec_min(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = vec_re(v); // this is only an approximation - // TODO: use fma - // Note: don't rewrite this expression, this may introduce - // cancellation errors - r += r * (RV(1.0) - x*r); // one Newton iteration (see vml_rcp) - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const { return vec_round(v); } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const - { -#if defined __VSX__ - return vec_rsqrt(v); -#else - realvec x = *this; - realvec r = vec_rsqrte(x.v); // this is only an approximation - // TODO: use fma - // one Newton iteration (see vml_rsqrt) - r += RV(0.5)*r * (RV(1.0) - x * r*r); - return r; -#endif - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { -#if defined 
__VSX__ - return vec_sqrt(v); -#else - return *this * rsqrt(); -#endif - } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return vec_trunc(v); } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return (__vector int) v; - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return -(__vector int)v; - } - - inline - auto boolvec::operator==(boolvec x) const -> boolvec_t - { - return as_int() == x.as_int(); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return vec_sel(y.v, x.v, v); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return vec_sel(y.v, x.v, v); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return (__vector float)v; - } - - inline auto intvec::convert_float() const -> realvec_t - { - return vec_ctf(v, 0); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FLOAT_ALTIVEC_H diff --git a/vec_float_avx.h b/vec_float_avx.h deleted file mode 100644 index 6c4f1ee..0000000 --- a/vec_float_avx.h +++ /dev/null @@ -1,646 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FLOAT_AVX_H -#define VEC_FLOAT_AVX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// AVX intrinsics -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FLOAT_8 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 8; - typedef bool scalar_t; - typedef __m256 bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool 
to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm256_castsi256_ps(_mm256_set1_epi32(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm256_castsi256_ps(_mm256_set_epi32(from_bool(as[7]), - from_bool(as[6]), - from_bool(as[5]), - from_bool(as[4]), - from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return _mm256_xor_ps(boolvec(true), v); } - - boolvec operator&&(boolvec x) const { return _mm256_and_ps(v, x.v); } - boolvec operator||(boolvec x) const { return _mm256_or_ps(v, x.v); } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const { return _mm256_xor_ps(v, x.v); } - - bool all() const - { - // return - // (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3] && - // (*this)[4] && (*this)[5] && (*this)[6] && (*this)[7]; - return ! (! *this).any(); - } - bool any() const - { - // return - // (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3] || - // (*this)[4] || (*this)[5] || (*this)[6] || (*this)[7]; - return ! 
_mm256_testz_ps(v, v); - } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 8; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi32(a)) {} - intvec(int_t const* as): v(_mm256_set_epi32(as[7], as[6], as[5], as[4], - as[3], as[2], as[1], as[0])) {} - static intvec iota() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return _mm256_castsi256_ps(v); } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check - // whether x is positive and x-1 is negative. 
- intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(0) - *this; } - - intvec operator+(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi32(vlo, xvlo); - vhi = _mm_add_epi32(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator-(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi32(vlo, xvlo); - vhi = _mm_sub_epi32(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator|(intvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator^(intvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - 
intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi32(vlo, n); - vhi = _mm_srli_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator>>(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srai_epi32(vlo, n); - vhi = _mm_srai_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator<<(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi32(vlo, n); - vhi = _mm_slli_epi32(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< x); - } - boolvec_t operator>(intvec const& x) const - { - return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! 
(*this < x); - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 8; - typedef real_t scalar_t; - typedef __m256 vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_ps(a)) {} - realvec(real_t const* as): v(_mm256_set_ps(as[7], as[6], as[5], as[4], - as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_ps(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_ps(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, 
mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_ps(p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_ps(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - _mm256_maskstore_ps(p, m.m.as_int(), v); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - // TODO: this is expensive - for (int n=0; n(realvec const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_GT_OQ); - } - boolvec_t operator>=(realvec const& x) const - { - return _mm256_cmp_ps(v, x.v, _CMP_GE_OQ); - } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const { return _mm256_ceil_ps(v); } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { 
return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return MF::vml_fabs(*this); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const { return _mm256_floor_ps(v); } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const { return _mm256_max_ps(v, y.v); } - realvec fmin(realvec y) const { return _mm256_min_ps(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return _mm256_cmp_ps(v, v, _CMP_UNORD_Q); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = _mm256_rcp_ps(x); // this is only an approximation - r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT); - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const - { - realvec x = *this; 
- realvec r = _mm256_rsqrt_ps(x); // this is only an approximation - r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) - return r; - } - boolvec_t signbit() const { return v; } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return _mm256_sqrt_ps(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const { return _mm256_round_ps(v, _MM_FROUND_TO_ZERO); } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return _mm256_castps_si256(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return _mm256_blendv_ps(y.v, x.v, v); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return _mm256_castsi256_ps(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - return _mm256_cvtepi32_ps(v); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FLOAT_AVX_H diff --git a/vec_float_neon.h b/vec_float_neon.h deleted file mode 100644 index fccc10f..0000000 --- a/vec_float_neon.h +++ /dev/null @@ -1,558 +0,0 @@ -// -*-C++-*- - -// - -#ifndef VEC_FLOAT_NEON_H -#define VEC_FLOAT_NEON_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// Neon intrinsics -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FLOAT_2 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 2; - typedef bool scalar_t; - typedef uint32x2_t bvector_t; - static int const alignment = 
sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values are -1, false values are 0 - static uint_t from_bool(bool a) { return -int_t(a); } - static bool to_bool(uint_t a) { return a; } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(vdup_n_u32(from_bool(a))) {} - boolvec(bool const* as) - { - for (int d=0; d - struct intvec: floatprops - { - static int const size = 2; - typedef int_t scalar_t; - typedef int32x2_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(vdup_n_s32(a)) {} - intvec(int_t const* as) - { - for (int d=0; d>(int_t n) const { return *this >> IV(n); } - intvec operator<<(int_t n) const { return *this << IV(n); } - intvec& operator>>=(int_t 
n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<>(intvec n) const - { - return vshl_s32(v, (-n).v); - } - intvec operator<<(intvec n) const - { - return vshl_s32(v, n.v); - } - intvec& operator>>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return vcgt_s32(v, x.v); } - boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 2; - typedef real_t scalar_t; - typedef float32x2_t vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+w" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(vdup_n_f32(a)) {} - realvec(real_t const* as) - { - for (int d=0; d mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return vld1_f32(p); - } - static realvec_t loadu(real_t const* p) - { -#if defined __ARM_FEATURE_UNALIGNED - return vld1_f32(p); -#else -# error "unaligned NEON loads not implemented" -#endif - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t 
loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - vst1_f32(p, v); - } - void storeu(real_t* p) const - { - // Vector stores would require vector loads, which would need to - // be atomic - // p[0] = (*this)[0]; - // p[1] = (*this)[1]; -#if defined __ARM_FEATURE_UNALIGNED - vst1_f32(p, v); -#else -# error "unaligned NEON stores not implemented" -#endif - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return vreinterpret_s32_f32(v); } - intvec_t convert_int() const { return vcvt_s32_f32(v); } - - - - realvec operator+() const { return *this; } - realvec 
operator-() const { return vneg_f32(v); } - - realvec operator+(realvec x) const { return vadd_f32(v, x.v); } - realvec operator-(realvec x) const { return vsub_f32(v, x.v); } - realvec operator*(realvec x) const { return vmul_f32(v, x.v); } - realvec operator/(realvec x) const { return *this * x.rcp(); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1]; - } - real_t sum() const - { - realvec r = vpadd_f32(v, v); - return r[0]; - } - - - - boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); } - boolvec_t operator!=(realvec const& x) const { return !(*this == x); } - boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); } - boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); } - boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); } - boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { - // return vrndp_f32(v); - return MF::vml_ceil(*this); - } - realvec copysign(realvec y) const - { - return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v); - } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { return MF::vml_exp(*this); } - realvec 
exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return vabs_f32(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { - // return vrndm_f32(v); - return MF::vml_floor(*this); - } - realvec fma(realvec y, realvec z) const - { - // TODO: vfma_f32 - return vmla_f32(z.v, v, y.v); - } - realvec fmax(realvec y) const { return vmax_f32(v, y.v); } - realvec fmin(realvec y) const { return vmin_f32(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec r = vrecpe_f32(v); - r *= vrecps_f32(v, r); - r *= vrecps_f32(v, r); - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { - // return vrndn_f32(v); - return MF::vml_rint(*this); - } - realvec round() const - { - // return vrnda_f32(v); - return MF::vml_round(*this); - } - realvec rsqrt() const - { - realvec r = vrsqrte_f32(v); - r *= vrsqrts_f32(v, r*r); - r *= 
vrsqrts_f32(v, r*r); - return r; - } - boolvec_t signbit() const { return MF::vml_signbit(*this); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return *this * rsqrt(); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { - // return vrnd_f32(v); - return MF::vml_trunc(*this); - } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return vreinterpret_s32_u32(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return - as_int(); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return vbsl_s32(v, x.v, y.v); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return vbsl_f32(v, x.v, y.v); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return vreinterpret_f32_s32(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - return vcvt_f32_s32(v); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FLOAT_NEON_H diff --git a/vec_float_sse2.h b/vec_float_sse2.h deleted file mode 100644 index c87ae02..0000000 --- a/vec_float_sse2.h +++ /dev/null @@ -1,651 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FLOAT_SSE2_H -#define VEC_FLOAT_SSE2_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// SSE2 intrinsics -#include -#ifdef __SSE3__ // Intel's SSE 3 -# include -#endif -#if defined __SSE4_1__ // Intel's SSE 4.1 -# include -#endif -#if defined __SSE4A__ // AMD's SSE 4a -# include -#endif -#if defined __AVX__ // Intel's AVX -# include -#endif - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FLOAT_4 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 4; - 
typedef bool scalar_t; - typedef __m128 bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - int_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): - v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {} - boolvec(bool const* as): - v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), - from_bool(as[2]), - from_bool(as[1]), - from_bool(as[0])))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return _mm_xor_ps(boolvec(true), v); } - - boolvec operator&&(boolvec x) const { return _mm_and_ps(v, x.v); } - boolvec operator||(boolvec x) const { return _mm_or_ps(v, x.v); } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const { return _mm_xor_ps(v, x.v); } - - bool all() const - { - // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; -#if defined __AVX__ - return ! (! 
*this).any(); -#else - boolvec x = *this; - x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2)); - x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); - return x[0]; -#endif - } - bool any() const - { - // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; -#if defined __AVX__ - return ! _mm_testz_ps(v, v); -#else - boolvec x = *this; - x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2)); - x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); - return x[0]; -#endif - } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 4; - typedef int_t scalar_t; - typedef __m128i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm_set1_epi32(a)) {} - intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {} - static intvec iota() { return _mm_set_epi32(3, 2, 1, 0); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return _mm_castsi128_ps(v); } - boolvec_t 
convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - return ! IV(_mm_cmpeq_epi32(v, IV(0))).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! - - intvec operator+() const { return *this; } - intvec operator-() const { return IV(0) - *this; } - - intvec operator+(intvec x) const { return _mm_add_epi32(v, x.v); } - intvec operator-(intvec x) const { return _mm_sub_epi32(v, x.v); } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - intvec operator|(intvec x) const - { - return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - intvec operator^(intvec x) const - { - return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v), - _mm_castsi128_ps(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const { return _mm_srli_epi32(v, n); } - intvec operator>>(int_t n) const { return _mm_srai_epi32(v, n); } - intvec operator<<(int_t n) const { return _mm_slli_epi32(v, n); } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< x); - } - boolvec_t operator>(intvec const& x) const - { - 
return x < *this; - } - boolvec_t operator>=(intvec const& x) const - { - return ! (*this < x); - } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 4; - typedef real_t scalar_t; - typedef __m128 vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm_set1_ps(a)) {} - realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm_load_ps(p); - } - static realvec_t loadu(real_t const* p) - { - return _mm_loadu_ps(p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - if (ioff==0) return loada(p); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return 
m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm_store_ps(p, v); - } - void storeu(real_t* p) const - { - return _mm_storeu_ps(p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { -#if defined __AVX__ - _mm_maskstore_ps(p, m.m.as_int(), v); -#else - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; -#endif - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } else { - if (m.m[0]) p[0] = (*this)[0]; - if (m.m[1]) p[1] = (*this)[1]; - if (m.m[2]) p[2] = (*this)[2]; - if (m.m[3]) p[3] = (*this)[3]; - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff, m); - storeu(p+ioff, m); - } - - - - intvec_t as_int() const { return _mm_castps_si128(v); } - intvec_t convert_int() const { return _mm_cvttps_epi32(v); } - - - - realvec operator+() const { return *this; } - realvec operator-() const { return RV(0.0) - *this; } - - realvec operator+(realvec x) const { return _mm_add_ps(v, x.v); } - realvec operator-(realvec x) const { return _mm_sub_ps(v, x.v); } - 
realvec operator*(realvec x) const { return _mm_mul_ps(v, x.v); } - realvec operator/(realvec x) const { return _mm_div_ps(v, x.v); } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const - { - return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; - } - real_t sum() const - { -#ifdef __SSE3__ - realvec x = *this; - x = _mm_hadd_ps(x.v, x.v); - x = _mm_hadd_ps(x.v, x.v); - return x[0]; -#else - return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; -#endif - } - - - - boolvec_t operator==(realvec const& x) const - { - return _mm_cmpeq_ps(v, x.v); - } - boolvec_t operator!=(realvec const& x) const - { - return _mm_cmpneq_ps(v, x.v); - } - boolvec_t operator<(realvec const& x) const - { - return _mm_cmplt_ps(v, x.v); - } - boolvec_t operator<=(realvec const& x) const - { - return _mm_cmple_ps(v, x.v); - } - boolvec_t operator>(realvec const& x) const - { - return _mm_cmpgt_ps(v, x.v); - } - boolvec_t operator>=(realvec const& x) const - { - return _mm_cmpge_ps(v, x.v); - } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { -#ifdef __SSE4_1__ - return _mm_ceil_ps(v); -#else - return MF::vml_ceil(*this); -#endif - } - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - 
realvec exp() const { return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return MF::vml_fabs(*this); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { -#ifdef __SSE4_1__ - return _mm_floor_ps(v); -#else - return MF::vml_floor(*this); -#endif - } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const { return _mm_max_ps(v, y.v); } - realvec fmin(realvec y) const { return _mm_min_ps(v, y.v); } - realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return _mm_cmpunord_ps(v, v);; } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const - { - realvec x = *this; - realvec r = _mm_rcp_ps(x); // this is only an approximation - r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) - return r; - } - realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } - realvec rint() const - { -#ifdef __SSE4_1__ - return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT); -#else - return 
MF::vml_rint(*this); -#endif - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const - { - realvec x = *this; - realvec r = _mm_rsqrt_ps(x); // this is only an approximation - r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) - return r; - } - boolvec_t signbit() const { return v; } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - realvec sqrt() const { return _mm_sqrt_ps(v); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { -#ifdef __SSE4_1__ - return _mm_round_ps(v, _MM_FROUND_TO_ZERO); -#else - return MF::vml_trunc(*this); -#endif - } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return _mm_castps_si128(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { -#ifdef __SSE4_1__ - return _mm_blendv_ps(y.v, x.v, v); -#else - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); -#endif - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return _mm_castsi128_ps(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - return _mm_cvtepi32_ps(v); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FLOAT_SSE2_H diff --git a/vec_float_sse2_scalar.h b/vec_float_sse2_scalar.h deleted file mode 100644 index 8dcb5b4..0000000 --- a/vec_float_sse2_scalar.h +++ /dev/null @@ -1,523 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FLOAT_SSE2_SCALAR_H -#define VEC_FLOAT_SSE2_SCALAR_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// SSE2 
intrinsics -#include -#ifdef __SSE3__ // Intel's SSE 3 -# include -#endif -#ifdef __SSE4_1__ // Intel's SSE 4.1 -# include -#endif -#ifdef __SSE4A__ // AMD's SSE 4a -# include -#endif -#if defined __AVX__ // Intel's AVX -# include -#endif - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FLOAT_1 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 1; - typedef bool scalar_t; - typedef uint_t bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - // true values are non-zero, false values are zero - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(a) {} - // TODO: remove this - boolvec(int x): v(x) {} - boolvec(bool const* as): v(as[0]) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return v; } - boolvec& set_elt(int n, bool a) { return v=a, *this; } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return !v; } - - boolvec operator&&(boolvec x) const { return v && x.v; } - boolvec operator||(boolvec x) const { return v || x.v; } - boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); } - boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); } - - bool all() 
const { return v; } - bool any() const { return v; } - - - - // ifthen(condition, then-value, else-value) - intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec - realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec - }; - - - - template<> - struct intvec: floatprops - { - static int const size = 1; - typedef int_t scalar_t; - typedef int_t ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(int_t a): v(a) {} - intvec(int_t const* as): v(as[0]) {} - static intvec iota() { return intvec(I(0)); } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return v; } - intvec& set_elt(int n, int_t a) { return v=a, *this; } - - - - boolvec_t as_bool() const { return U(v); } - boolvec_t convert_bool() const { return bool(v); } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - intvec operator+() const { return +v; } - intvec operator-() const { return -v; } - - intvec operator+(intvec x) const { return v+x.v; } - intvec operator-(intvec x) const { return v-x.v; } - intvec operator*(intvec x) const { return v*x.v; } - intvec operator/(intvec x) const { return v/x.v; } - intvec operator%(intvec x) const { return v%x.v; } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& 
operator-=(intvec const& x) { return *this=*this-x; } - intvec& operator*=(intvec const& x) { return *this=*this*x; } - intvec& operator/=(intvec const& x) { return *this=*this/x; } - intvec& operator%=(intvec const& x) { return *this=*this%x; } - - - - intvec operator~() const { return ~v; } - - intvec operator&(intvec x) const { return v&x.v; } - intvec operator|(intvec x) const { return v|x.v; } - intvec operator^(intvec x) const { return v^x.v; } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const { return U(v) >> U(n); } - intvec operator>>(int_t n) const { return v>>n; } - intvec operator<<(int_t n) const { return v<>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n); } - intvec operator>>(intvec n) const { return v>>n; } - intvec operator<<(intvec n) const { return v<>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return v>x.v; } - boolvec_t operator>=(intvec const& x) const { return v>=x.v; } - }; - - - - template<> - struct realvec: floatprops - { - static int const size = 1; - typedef real_t scalar_t; - typedef float vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - private: - static __m128 from_float(float a) { return _mm_set_ss(a); } - static float to_float(__m128 a) { return _mm_cvtss_f32(a); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - 
typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(real_t a): v(a) {} - realvec(real_t const* as): v(as[0]) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return v; } - realvec& set_elt(int n, real_t a) { return v=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return *p; - } - static realvec_t loadu(real_t const* p) - { - return *p; - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return *this; - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return loada(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - *p = v; - } - void storeu(real_t* p) const - { - *p = v; - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - storea(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } - } - void storeu(real_t* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - storeu(p); - } - } - void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % 
alignment == 0); - storea(p+ioff, m); - } - - - - intvec_t as_int() const { return floatprops::as_int(v); } - intvec_t convert_int() const { - // return floatprops::convert_int(v); - return _mm_cvttss_si32(_mm_set_ss(v)); - } - - - - realvec operator+() const { return +v; } - realvec operator-() const { return -v; } - - realvec operator+(realvec x) const { return v+x.v; } - realvec operator-(realvec x) const { return v-x.v; } - realvec operator*(realvec x) const { return v*x.v; } - realvec operator/(realvec x) const { return v/x.v; } - - realvec& operator+=(realvec const& x) { return *this=*this+x; } - realvec& operator-=(realvec const& x) { return *this=*this-x; } - realvec& operator*=(realvec const& x) { return *this=*this*x; } - realvec& operator/=(realvec const& x) { return *this=*this/x; } - - real_t prod() const { return v; } - real_t sum() const { return v; } - - - - boolvec_t operator==(realvec const& x) const { return v==x.v; } - boolvec_t operator!=(realvec const& x) const { return v!=x.v; } - boolvec_t operator<(realvec const& x) const { return v(realvec const& x) const { return v>x.v; } - boolvec_t operator>=(realvec const& x) const { return v>=x.v; } - - - - realvec acos() const { return MF::vml_acos(*this); } - realvec acosh() const { return MF::vml_acosh(*this); } - realvec asin() const { return MF::vml_asin(*this); } - realvec asinh() const { return MF::vml_asinh(*this); } - realvec atan() const { return MF::vml_atan(*this); } - realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } - realvec atanh() const { return MF::vml_atanh(*this); } - realvec cbrt() const { return MF::vml_cbrt(*this); } - realvec ceil() const - { -#ifdef __SSE4_1__ - return to_float(_mm_ceil_ss(from_float(v), from_float(v))); -#else - return std::ceil(v); -#endif - } - realvec copysign(realvec y) const { return std::copysign(v, y.v); } - realvec cos() const { return MF::vml_cos(*this); } - realvec cosh() const { return MF::vml_cosh(*this); } - realvec exp() const { 
return MF::vml_exp(*this); } - realvec exp10() const { return MF::vml_exp10(*this); } - realvec exp2() const { return MF::vml_exp2(*this); } - realvec expm1() const { return MF::vml_expm1(*this); } - realvec fabs() const { return std::fabs(v); } - realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } - realvec floor() const - { -#ifdef __SSE4_1__ - return to_float(_mm_floor_ss(from_float(v), from_float(v))); -#else - return std::floor(v); -#endif - } - realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } - realvec fmax(realvec y) const - { - return to_float(_mm_max_ss(from_float(v), from_float(y.v))); - } - realvec fmin(realvec y) const - { - return to_float(_mm_min_ss(from_float(v), from_float(y.v))); - } - realvec fmod(realvec y) const { return std::fmod(v, y.v); } - realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } - intvec_t ilogb() const - { - int_t r = std::ilogb(v); - if (r == FP_ILOGB0) r = numeric_limits::min(); - else if (r == FP_ILOGBNAN) r = numeric_limits::max(); - return r; - } - boolvec_t isfinite() const { return std::isfinite(v); } - boolvec_t isinf() const { return std::isinf(v); } - boolvec_t isnan() const - { - return _mm_ucomineq_ss(from_float(v), from_float(v)); - } - boolvec_t isnormal() const { return std::isnormal(v); } - realvec ldexp(int_t n) const { return std::ldexp(v, n); } - realvec ldexp(intvec_t n) const { return std::ldexp(v, n); } - realvec log() const { return MF::vml_log(*this); } - realvec log10() const { return MF::vml_log10(*this); } - realvec log1p() const { return MF::vml_log1p(*this); } - realvec log2() const { return MF::vml_log2(*this); } - realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } - realvec pow(realvec y) const { return MF::vml_pow(*this, y); } - realvec rcp() const { return R(1.0)/v; } - realvec remainder(realvec y) const { return std::remainder(v, y.v); } - realvec rint() const - { -#ifdef __SSE4_1__ - return 
to_float(_mm_round_ss(from_float(v), from_float(v), - _MM_FROUND_TO_NEAREST_INT)); -#else - return MF::vml_rint(*this); -#endif - } - realvec round() const { return MF::vml_round(*this); } - realvec rsqrt() const { return MF::vml_rsqrt(*this); } - boolvec_t signbit() const { return std::signbit(v); } - realvec sin() const { return MF::vml_sin(*this); } - realvec sinh() const { return MF::vml_sinh(*this); } - // realvec sqrt1() const { return std::sqrt(v); } - realvec sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); } - realvec tan() const { return MF::vml_tan(*this); } - realvec tanh() const { return MF::vml_tanh(*this); } - realvec trunc() const - { -#ifdef __SSE4_1__ - return to_float(_mm_round_ss(from_float(v), from_float(v), - _MM_FROUND_TO_ZERO)); -#else - return MF::vml_trunc(*this); -#endif - } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return I(v); - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return v; - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return v ? x : y; - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return v ? 
x : y; - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return FP::as_float(v); - } - - inline auto intvec::convert_float() const -> realvec_t - { - // return FP::convert_float(v); - return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v)); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FLOAT_SSE2_SCALAR_H diff --git a/vec_fp16_avx.h b/vec_fp16_avx.h deleted file mode 100644 index 5dc456c..0000000 --- a/vec_fp16_avx.h +++ /dev/null @@ -1,582 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FP16_AVX_H -#define VEC_FP16_AVX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// AVX intrinsics -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FP16_16 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 16; - typedef bool scalar_t; - typedef __m256i bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {} - boolvec(bool const* as): - 
v(_mm256_set_epi16(from_bool(as[15]), - from_bool(as[14]), - from_bool(as[13]), - from_bool(as[12]), - from_bool(as[11]), - from_bool(as[10]), - from_bool(as[ 9]), - from_bool(as[ 8]), - from_bool(as[ 7]), - from_bool(as[ 6]), - from_bool(as[ 5]), - from_bool(as[ 4]), - from_bool(as[ 3]), - from_bool(as[ 2]), - from_bool(as[ 1]), - from_bool(as[ 0]))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return *this != boolvec(true); } - - boolvec operator&&(boolvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator||(boolvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - bool all() const - { - bool r = true; - for (int n=0; n - struct intvec: floatprops - { - static int const size = 16; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // 
intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi16(a)) {} - intvec(int_t const* as): - v(_mm256_set_epi16(as[15], - as[14], - as[13], - as[12], - as[11], - as[10], - as[ 9], - as[ 8], - as[ 7], - as[ 6], - as[ 5], - as[ 4], - as[ 3], - as[ 2], - as[ 1], - as[ 0])) {} - static intvec iota() - { - return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return v; } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! 
- - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi16(vlo, xvlo); - vhi = _mm_add_epi16(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator-(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi16(vlo, xvlo); - vhi = _mm_sub_epi16(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator|(intvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator^(intvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srli_epi16(vlo, n); - vhi = _mm_srli_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator>>(int_t n) const - { - __m128i vlo = 
_mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_srai_epi16(vlo, n); - vhi = _mm_srai_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator<<(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - vlo = _mm_slli_epi16(vlo, n); - vhi = _mm_slli_epi16(vhi, n); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< - struct realvec: floatprops - { - static int const size = 16; - typedef real_t scalar_t; - typedef __m256i vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {} - realvec(real_t const* as): - v(_mm256_set_epi16(FP::as_int(as[15]), - FP::as_int(as[14]), - FP::as_int(as[13]), - FP::as_int(as[12]), 
- FP::as_int(as[11]), - FP::as_int(as[10]), - FP::as_int(as[ 9]), - FP::as_int(as[ 8]), - FP::as_int(as[ 7]), - FP::as_int(as[ 6]), - FP::as_int(as[ 5]), - FP::as_int(as[ 4]), - FP::as_int(as[ 3]), - FP::as_int(as[ 2]), - FP::as_int(as[ 1]), - FP::as_int(as[ 0]))) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_si256((__m256i*)p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_si256((__m256i*)p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - 
VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // TODO: this is expensive - for (int n=0; n(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } - - - - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec fabs() const { return MF::vml_fabs(*this); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - boolvec_t signbit() const { return v; } - }; - - - - // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return v; - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return v; - } - - inline auto intvec::convert_float() const -> realvec_t - { - __builtin_unreachable(); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FP16_AVX_H diff --git a/vec_fp8_avx.h b/vec_fp8_avx.h deleted file mode 100644 index 16087d1..0000000 --- a/vec_fp8_avx.h +++ /dev/null @@ -1,648 +0,0 @@ -// -*-C++-*- - -#ifndef VEC_FP8_AVX_H -#define VEC_FP8_AVX_H - -#include "floatprops.h" -#include "mathfuncs.h" -#include "vec_base.h" - -#include - -// AVX 
intrinsics -#include - - - -namespace vecmathlib { - -#define VECMATHLIB_HAVE_VEC_FP8_32 - template<> struct boolvec; - template<> struct intvec; - template<> struct realvec; - - - - template<> - struct boolvec: floatprops - { - static int const size = 32; - typedef bool scalar_t; - typedef __m256i bvector_t; - static int const alignment = sizeof(bvector_t); - - static_assert(size * sizeof(real_t) == sizeof(bvector_t), - "vector size is wrong"); - - private: - // true values have the sign bit set, false values have it unset - static uint_t from_bool(bool a) { return - uint_t(a); } - static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } - public: - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - bvector_t v; - - boolvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // boolvec(boolvec const& x): v(x.v) {} - // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } - boolvec(bvector_t x): v(x) {} - boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {} - boolvec(bool const* as): - v(_mm256_set_epi8(from_bool(as[31]), - from_bool(as[30]), - from_bool(as[29]), - from_bool(as[28]), - from_bool(as[27]), - from_bool(as[26]), - from_bool(as[25]), - from_bool(as[24]), - from_bool(as[23]), - from_bool(as[22]), - from_bool(as[21]), - from_bool(as[20]), - from_bool(as[19]), - from_bool(as[18]), - from_bool(as[17]), - from_bool(as[16]), - from_bool(as[15]), - from_bool(as[14]), - from_bool(as[13]), - from_bool(as[12]), - from_bool(as[11]), - from_bool(as[10]), - from_bool(as[ 9]), - from_bool(as[ 8]), - from_bool(as[ 7]), - from_bool(as[ 6]), - from_bool(as[ 5]), - from_bool(as[ 4]), - from_bool(as[ 3]), - from_bool(as[ 2]), - from_bool(as[ 1]), - 
from_bool(as[ 0]))) {} - - operator bvector_t() const { return v; } - bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } - boolvec& set_elt(int n, bool a) - { - return ((uint_t*)&v)[n]=from_bool(a), *this; - } - - - - intvec_t as_int() const; // defined after intvec - intvec_t convert_int() const; // defined after intvec - - - - boolvec operator!() const { return *this != boolvec(true); } - - boolvec operator&&(boolvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator||(boolvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - boolvec operator==(boolvec x) const { return !(*this!=x); } - boolvec operator!=(boolvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - bool all() const - { - bool r = true; - for (int n=0; n - struct intvec: floatprops - { - static int const size = 32; - typedef int_t scalar_t; - typedef __m256i ivector_t; - static int const alignment = sizeof(ivector_t); - - static_assert(size * sizeof(real_t) == sizeof(ivector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; - typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - ivector_t v; - - intvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // intvec(intvec const& x): v(x.v) {} - // intvec& operator=(intvec const& x) { return v=x.v, *this; } - intvec(ivector_t x): v(x) {} - intvec(int_t a): v(_mm256_set1_epi8(a)) {} - intvec(int_t const* as): - v(_mm256_set_epi8(as[31], - as[30], - as[29], - as[28], - as[27], - as[26], - as[25], - as[24], - as[23], - as[22], - as[21], - 
as[20], - as[19], - as[18], - as[17], - as[16], - as[15], - as[14], - as[13], - as[12], - as[11], - as[10], - as[ 9], - as[ 8], - as[ 7], - as[ 6], - as[ 5], - as[ 4], - as[ 3], - as[ 2], - as[ 1], - as[ 0])) {} - static intvec iota() - { - return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, - 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0); - } - - operator ivector_t() const { return v; } - int_t operator[](int n) const { return ((int_t const*)&v)[n]; } - intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } - - - - boolvec_t as_bool() const { return v; } - boolvec_t convert_bool() const - { - // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check - // whether x is positive and x-1 is negative. - intvec x = *this; - // We know that boolvec values depend only on the sign bit - // return (~(x-1) | x).as_bool(); - // return x.as_bool() || !(x-1).as_bool(); - return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); - } - realvec_t as_float() const; // defined after realvec - realvec_t convert_float() const; // defined after realvec - - - - // Note: not all arithmetic operations are supported! 
- - intvec operator+() const { return *this; } - intvec operator-() const { return IV(I(0)) - *this; } - - intvec operator+(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_add_epi8(vlo, xvlo); - vhi = _mm_add_epi8(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator-(intvec x) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - __m128i xvlo = _mm256_castsi256_si128(x.v); - __m128i xvhi = _mm256_extractf128_si256(x.v, 1); - vlo = _mm_sub_epi8(vlo, xvlo); - vhi = _mm_sub_epi8(vhi, xvhi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - - intvec& operator+=(intvec const& x) { return *this=*this+x; } - intvec& operator-=(intvec const& x) { return *this=*this-x; } - - - - intvec operator~() const { return IV(~U(0)) ^ *this; } - - intvec operator&(intvec x) const - { - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator|(intvec x) const - { - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - intvec operator^(intvec x) const - { - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), - _mm256_castsi256_ps(x.v))); - } - - intvec& operator&=(intvec const& x) { return *this=*this&x; } - intvec& operator|=(intvec const& x) { return *this=*this|x; } - intvec& operator^=(intvec const& x) { return *this=*this^x; } - - - - intvec lsr(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU) >> U(n); - uint_t maskhi = U(0xff00U); - __m128i mask = _mm_set1_epi16(masklo | maskhi); - vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); - vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), 
mask); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator>>(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU); - uint_t maskhi = U(0xff00U); - __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8), - _mm_set1_epi16(masklo)); - __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n), - _mm_set1_epi16(maskhi)); - vlo = _mm_or_si128(vlolo, vlohi); - __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8), - _mm_set1_epi16(masklo)); - __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n), - _mm_set1_epi16(maskhi)); - vhi = _mm_or_si128(vhilo, vhihi); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec operator<<(int_t n) const - { - __m128i vlo = _mm256_castsi256_si128(v); - __m128i vhi = _mm256_extractf128_si256(v, 1); - uint_t masklo = U(0x00ffU); - uint_t maskhi = U(0xff00U) << U(n); - __m128i mask = _mm_set1_epi16(masklo | maskhi); - vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); - vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); - return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); - } - intvec& operator>>=(int_t n) { return *this=*this>>n; } - intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); - } - return r; - } - intvec operator>>(intvec n) const - { - intvec r; - for (int i=0; i> n[i]); - } - return r; - } - intvec operator<<(intvec n) const - { - intvec r; - for (int i=0; i>=(intvec n) { return *this=*this>>n; } - intvec& operator<<=(intvec n) { return *this=*this< - struct realvec: floatprops - { - static int const size = 32; - typedef real_t scalar_t; - typedef __m256i vector_t; - static int const alignment = sizeof(vector_t); - - static char const* name() { return ""; } - void barrier() { __asm__("": "+x" (v)); } - - static_assert(size * sizeof(real_t) == sizeof(vector_t), - "vector size is wrong"); - - typedef boolvec boolvec_t; 
- typedef intvec intvec_t; - typedef realvec realvec_t; - - // Short names for type casts - typedef real_t R; - typedef int_t I; - typedef uint_t U; - typedef realvec_t RV; - typedef intvec_t IV; - typedef boolvec_t BV; - typedef floatprops FP; - typedef mathfuncs MF; - - - - vector_t v; - - realvec() {} - // Can't have a non-trivial copy constructor; if so, objects won't - // be passed in registers - // realvec(realvec const& x): v(x.v) {} - // realvec& operator=(realvec const& x) { return v=x.v, *this; } - realvec(vector_t x): v(x) {} - realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {} - realvec(real_t const* as): - v(_mm256_set_epi8(FP::as_int(as[31]), - FP::as_int(as[30]), - FP::as_int(as[29]), - FP::as_int(as[28]), - FP::as_int(as[27]), - FP::as_int(as[26]), - FP::as_int(as[25]), - FP::as_int(as[24]), - FP::as_int(as[23]), - FP::as_int(as[22]), - FP::as_int(as[21]), - FP::as_int(as[20]), - FP::as_int(as[19]), - FP::as_int(as[18]), - FP::as_int(as[17]), - FP::as_int(as[16]), - FP::as_int(as[15]), - FP::as_int(as[14]), - FP::as_int(as[13]), - FP::as_int(as[12]), - FP::as_int(as[11]), - FP::as_int(as[10]), - FP::as_int(as[ 9]), - FP::as_int(as[ 8]), - FP::as_int(as[ 7]), - FP::as_int(as[ 6]), - FP::as_int(as[ 5]), - FP::as_int(as[ 4]), - FP::as_int(as[ 3]), - FP::as_int(as[ 2]), - FP::as_int(as[ 1]), - FP::as_int(as[ 0]))) {} - - operator vector_t() const { return v; } - real_t operator[](int n) const { return ((real_t const*)&v)[n]; } - realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } - - - - typedef vecmathlib::mask_t mask_t; - - static realvec_t loada(real_t const* p) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - return _mm256_load_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p) - { - return _mm256_loadu_si256((__m256i const*)p); - } - static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff); - 
return loadu(p+ioff); - } - realvec_t loada(real_t const* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(all(m.m), true)) { - return loada(p); - } else { - return m.m.ifthen(loada(p), *this); - } - } - realvec_t loadu(real_t const* p, mask_t const& m) const - { - if (__builtin_expect(m.all_m, true)) { - return loadu(p); - } else { - return m.m.ifthen(loadu(p), *this); - } - } - realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return loada(p+ioff, m); - return loadu(p+ioff, m); - } - - void storea(real_t* p) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - _mm256_store_si256((__m256i*)p, v); - } - void storeu(real_t* p) const - { - return _mm256_storeu_si256((__m256i*)p, v); - } - void storeu(real_t* p, std::ptrdiff_t ioff) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (ioff % realvec::size == 0) return storea(p+ioff); - storeu(p+ioff); - } - void storea(real_t* p, mask_t const& m) const - { - VML_ASSERT(intptr_t(p) % alignment == 0); - if (__builtin_expect(m.all_m, true)) { - storea(p); - } else { - // TODO: this is expensive - for (int n=0; n(realvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } - - - - realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } - realvec fabs() const { return MF::vml_fabs(*this); } - intvec_t ilogb() const { return MF::vml_ilogb(*this); } - boolvec_t isfinite() const { return MF::vml_isfinite(*this); } - boolvec_t isinf() const { return MF::vml_isinf(*this); } - boolvec_t isnan() const { return MF::vml_isnan(*this); } - boolvec_t isnormal() const { return MF::vml_isnormal(*this); } - realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } - realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } - boolvec_t signbit() const { return v; } - }; - - - 
- // boolvec definitions - - inline - auto boolvec::as_int() const -> intvec_t - { - return v; - } - - inline - auto boolvec::convert_int() const -> intvec_t - { - return lsr(as_int(), bits-1); - } - - inline - auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t - { - return ifthen(x.as_float(), y.as_float()).as_int(); - } - - inline - auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t - { - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); - } - - - - // intvec definitions - - inline auto intvec::as_float() const -> realvec_t - { - return v; - } - - inline auto intvec::convert_float() const -> realvec_t - { - __builtin_unreachable(); - } - -} // namespace vecmathlib - -#endif // #ifndef VEC_FP8_AVX_H diff --git a/vec_neon_float2.h b/vec_neon_float2.h new file mode 100644 index 0000000..258a091 --- /dev/null +++ b/vec_neon_float2.h @@ -0,0 +1,558 @@ +// -*-C++-*- + +// + +#ifndef VEC_NEON_FLOAT2_H +#define VEC_NEON_FLOAT2_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// Neon intrinsics +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FLOAT_2 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 2; + typedef bool scalar_t; + typedef uint32x2_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values are -1, false values are 0 + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + 
typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(vdup_n_u32(from_bool(a))) {} + boolvec(bool const* as) + { + for (int d=0; d + struct intvec: floatprops + { + static int const size = 2; + typedef int_t scalar_t; + typedef int32x2_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(vdup_n_s32(a)) {} + intvec(int_t const* as) + { + for (int d=0; d>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<>(intvec n) const + { + return vshl_s32(v, (-n).v); + } + intvec operator<<(intvec n) const + { + return vshl_s32(v, n.v); + } + intvec& operator>>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return vcgt_s32(v, x.v); } + boolvec_t operator>=(intvec const& x) const { return vcge_s32(v, x.v); } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 2; + typedef real_t 
scalar_t; + typedef float32x2_t vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+w" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(vdup_n_f32(a)) {} + realvec(real_t const* as) + { + for (int d=0; d mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vld1_f32(p); + } + static realvec_t loadu(real_t const* p) + { +#if defined __ARM_FEATURE_UNALIGNED + return vld1_f32(p); +#else +# error "unaligned NEON loads not implemented" +#endif + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return 
loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + vst1_f32(p, v); + } + void storeu(real_t* p) const + { + // Vector stores would require vector loads, which would need to + // be atomic + // p[0] = (*this)[0]; + // p[1] = (*this)[1]; +#if defined __ARM_FEATURE_UNALIGNED + vst1_f32(p, v); +#else +# error "unaligned NEON stores not implemented" +#endif + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return vreinterpret_s32_f32(v); } + intvec_t convert_int() const { return vcvt_s32_f32(v); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return vneg_f32(v); } + + realvec operator+(realvec x) const { return vadd_f32(v, x.v); } + realvec operator-(realvec x) const { return vsub_f32(v, x.v); } + realvec operator*(realvec x) const { return vmul_f32(v, x.v); } + realvec operator/(realvec x) const { return *this * x.rcp(); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return 
*this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1]; + } + real_t sum() const + { + realvec r = vpadd_f32(v, v); + return r[0]; + } + + + + boolvec_t operator==(realvec const& x) const { return vceq_f32(v, x.v); } + boolvec_t operator!=(realvec const& x) const { return !(*this == x); } + boolvec_t operator<(realvec const& x) const { return vclt_f32(v, x.v); } + boolvec_t operator<=(realvec const& x) const { return vcle_f32(v, x.v); } + boolvec_t operator>(realvec const& x) const { return vcgt_f32(v, x.v); } + boolvec_t operator>=(realvec const& x) const { return vcge_f32(v, x.v); } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const + { + // return vrndp_f32(v); + return MF::vml_ceil(*this); + } + realvec copysign(realvec y) const + { + return vbsl_f32(vdup_n_u32(FP::signbit_mask), y.v, v); + } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vabs_f32(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const + { + // return vrndm_f32(v); + return MF::vml_floor(*this); + } + realvec fma(realvec y, realvec z) const + { + // TODO: vfma_f32 + return vmla_f32(z.v, v, y.v); + } + realvec fmax(realvec y) const { return vmax_f32(v, y.v); } + realvec fmin(realvec y) const { 
return vmin_f32(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const + { + realvec r = vrecpe_f32(v); + r *= vrecps_f32(v, r); + r *= vrecps_f32(v, r); + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const + { + // return vrndn_f32(v); + return MF::vml_rint(*this); + } + realvec round() const + { + // return vrnda_f32(v); + return MF::vml_round(*this); + } + realvec rsqrt() const + { + realvec r = vrsqrte_f32(v); + r *= vrsqrts_f32(v, r*r); + r *= vrsqrts_f32(v, r*r); + return r; + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return *this * rsqrt(); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const + { + // return vrnd_f32(v); + return MF::vml_trunc(*this); + } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return 
vreinterpret_s32_u32(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return - as_int(); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return vbsl_s32(v, x.v, y.v); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return vbsl_f32(v, x.v, y.v); + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return vreinterpret_f32_s32(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + return vcvt_f32_s32(v); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_NEON_FLOAT2_H diff --git a/vec_qpx_double4.h b/vec_qpx_double4.h new file mode 100644 index 0000000..e7a1c05 --- /dev/null +++ b/vec_qpx_double4.h @@ -0,0 +1,667 @@ +// -*-C++-*- + +#ifndef VEC_QPX_DOUBLE4_H +#define VEC_QPX_DOUBLE4_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include +#warning "TODO" +#include + +// QPX intrinsics +#ifdef __clang__ +# include +#else +# include +#endif +#include + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_DOUBLE_4 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 4; + typedef bool scalar_t; + typedef vector4double bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // canonical true is +1.0, canonical false is -1.0 + // >=0 is true, -0 is true, nan is false + static real_t from_bool(bool a) { return a ? 
+1.0 : -1.0; } + static bool to_bool(real_t a) { return a>=0.0; } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(vec_splats(from_bool(a))) {} + boolvec(bool const* as) + { + for (int d=0; d + struct intvec: floatprops + { + static int const size = 4; + typedef int_t scalar_t; + typedef vector4double ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(vec_splats(FP::as_float(a))) {} + intvec(int_t const* as) + { + for (int d=0; d> U(n)); + return r; + } + intvec operator>>(int_t n) const + { + intvec r; + for (int d=0; d> n); + return r; + } + intvec operator<<(int_t n) const + { + intvec r; + for (int d=0; d>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[d])); + return r; + } + intvec 
operator>>(intvec n) const + { + intvec r; + for (int d=0; d> n[d]); + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int d=0; d>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const + { + boolvec_t r; + for (int d=0; d x[d]); + return r; + } + boolvec_t operator>=(intvec const& x) const + { + boolvec_t r; + for (int d=0; d= x[d]); + return r; + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 4; + typedef real_t scalar_t; + typedef vector4double vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+v" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(vec_splats(a)) {} + realvec(real_t const* as) + { + for (int d=0; d mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_lda(0, (real_t*)p); + } + static realvec_t loadu(real_t const* p) + { + realvec_t v0 = vec_ld(0, (real_t*)p); + realvec_t v1 = vec_ld(31, (real_t*)p); + return vec_perm(v0.v, v1.v, vec_lvsl(0, (real_t*)p)); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + // TODO: use load instruction 
with fixed offset + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + // TODO: use load instruction with fixed offset + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); +#warning "TODO" + std::cout << "yes this is storea\n"; + vec_sta(v, 0, p); + } + void storeu(real_t* p) const + { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + p[2] = (*this)[2]; + p[3] = (*this)[3]; + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) 
return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { return vec_ctid(v); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return vec_neg(v); } + + realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { return vec_mul(v, x.v); } + realvec operator/(realvec x) const + { + // return vec_swdiv_nochk(v, x.v); + return div_fastd4(v, x.v); + } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const + { + return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; + } + + + + boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec const& x) const { return ! (*this == x); } + boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec const& x) const { return ! (*this > x); } + boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec const& x) const { return ! 
(*this < x); } + + + + realvec acos() const { return acosd4(v); } + realvec acosh() const { return acoshd4(v); } + realvec asin() const { return asind4(v); } + realvec asinh() const { return asinhd4(v); } + realvec atan() const { return atand4(v); } + realvec atan2(realvec y) const { return atan2d4(v, y.v); } + realvec atanh() const { return atanhd4(v); } + realvec cbrt() const { return cbrtd4(v); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return vec_cpsgn(v, y.v); } + realvec cos() const { return cosd4(v); } + realvec cosh() const { return coshd4(v); } + realvec exp() const { return expd4(v); } + realvec exp10() const { return exp10d4(v); } + realvec exp2() const { return exp2d4(v); } + realvec expm1() const { return expm1d4(v); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return MF::vml_fmax(v, y.v); } + realvec fmin(realvec y) const { return MF::vml_fmin(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return hypotd4(v, y.v); } + intvec_t ilogb() const + { + int_t ilogb_[] = { + ::ilogb((*this)[0]), + ::ilogb((*this)[1]), + ::ilogb((*this)[2]), + ::ilogb((*this)[3]) + }; + return intvec_t(ilogb_); + } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return vec_tstnan(v, v); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return ldexp(intvec_t(n)); } + realvec ldexp(intvec_t n) const + { + real_t ldexp_[] = { + std::ldexp((*this)[0], n[0]), + std::ldexp((*this)[1], n[1]), + std::ldexp((*this)[2], n[2]), + std::ldexp((*this)[3], n[3]) + }; + return realvec_t(ldexp_); + } + realvec 
log() const { return logd4(v); } + realvec log10() const { return log10d4(v); } + realvec log1p() const { return log1pd4(v); } + realvec log2() const { return log2d4(v); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return powd4(v, y.v); } + realvec rcp() const { return recip_fastd4(v); } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return MF::vml_rint(*this); } + realvec round() const { return vec_round(v); } + realvec rsqrt() const + { + realvec x = *this; + realvec r = vec_rsqrte(x.v); // this is only an approximation + // TODO: use fma + // one Newton iteration (see vml_rsqrt) + r += RV(0.5)*r * (RV(1.0) - x * r*r); + return r; + } + boolvec_t signbit() const { return !copysign(RV(1.0)).as_int().as_bool(); } + realvec sin() const { return sind4(v); } + realvec sinh() const { return sinhd4(v); } + realvec sqrt() const + { + // return vec_sqrtsw_nochk(v); + return *this * rsqrt(); + } + realvec tan() const { return tand4(v); } + realvec tanh() const { return tanhd4(v); } + realvec trunc() const { return vec_trunc(v); } + }; + + + + // boolvec definitions + + inline + boolvec::intvec_t boolvec::as_int() const + { + return v; + } + + inline + boolvec::intvec_t boolvec::convert_int() const + { + return ifthen(IV(I(1)), IV(I(0))); + } + + inline + boolvec::intvec_t boolvec::ifthen(intvec_t x, + intvec_t y) const + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + boolvec::realvec_t boolvec::ifthen(realvec_t x, + realvec_t y) const + { + return vec_sel(y.v, x.v, v); + } + + + + // intvec definitions + + inline intvec::realvec_t intvec::as_float() const + { + return v; + } + + inline intvec::realvec_t intvec::convert_float() const + { + return vec_cfid(v); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_QPX_DOUBLE4_H diff --git a/vec_sse_double1.h b/vec_sse_double1.h new file mode 100644 index 0000000..ff1145e --- 
/dev/null +++ b/vec_sse_double1.h @@ -0,0 +1,528 @@ +// -*-C++-*- + +#ifndef VEC_SSE_DOUBLE1_H +#define VEC_SSE_DOUBLE1_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// SSE2 intrinsics +#include +#ifdef __SSE3__ // Intel's SSE 3 +# include +#endif +#ifdef __SSE4_1__ // Intel's SSE 4.1 +# include +#endif +#ifdef __SSE4A__ // AMD's SSE 4a +# include +#endif +#if defined __AVX__ // Intel's AVX +# include +#endif + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_DOUBLE_1 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 1; + typedef bool scalar_t; + typedef uint_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + // true values are non-zero, false values are zero + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(a) {} + // TODO: remove this + boolvec(int x): v(x) {} + boolvec(bool const* as): v(as[0]) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return v; } + boolvec& set_elt(int n, bool a) { return v=a, *this; } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return !v; } + + boolvec operator&&(boolvec x) const { return v && 
x.v; } + boolvec operator||(boolvec x) const { return v || x.v; } + boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); } + boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); } + + bool all() const { return v; } + bool any() const { return v; } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 1; + typedef int_t scalar_t; + typedef int_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(int_t a): v(a) {} + intvec(int_t const* as): v(as[0]) {} + static intvec iota() { return intvec(I(0)); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return v; } + intvec& set_elt(int n, int_t a) { return v=a, *this; } + + + + boolvec_t as_bool() const { return U(v); } + boolvec_t convert_bool() const { return bool(v); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + intvec operator+() const { return +v; } + intvec operator-() const { return -v; } + + intvec operator+(intvec x) const { return v+x.v; } + intvec operator-(intvec x) const { return v-x.v; } + intvec operator*(intvec 
x) const { return v*x.v; } + intvec operator/(intvec x) const { return v/x.v; } + intvec operator%(intvec x) const { return v%x.v; } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + intvec& operator*=(intvec const& x) { return *this=*this*x; } + intvec& operator/=(intvec const& x) { return *this=*this/x; } + intvec& operator%=(intvec const& x) { return *this=*this%x; } + + + + intvec operator~() const { return ~v; } + + intvec operator&(intvec x) const { return v&x.v; } + intvec operator|(intvec x) const { return v|x.v; } + intvec operator^(intvec x) const { return v^x.v; } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const { return U(v) >> U(n); } + intvec operator>>(int_t n) const { return v>>n; } + intvec operator<<(int_t n) const { return v<>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n); } + intvec operator>>(intvec n) const { return v>>n; } + intvec operator<<(intvec n) const { return v<>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return v>x.v; } + boolvec_t operator>=(intvec const& x) const { return v>=x.v; } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 1; + typedef real_t scalar_t; + typedef double vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + private: + static __m128d from_double(double a) { return _mm_set_sd(a); } + static double to_double(__m128d a) { return _mm_cvtsd_f64(a); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + 
typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + // The stored value; the members below construct it, return it, or copy it to and from memory + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(real_t a): v(a) {} + realvec(real_t const* as): v(as[0]) {} // reads only as[0] + + operator vector_t() const { return v; } + real_t operator[](int n) const { return v; } // n is not used; v is returned for every index + realvec& set_elt(int n, real_t a) { return v=a, *this; } // n is not used; overwrites v and returns *this + + // Memory access: every loada/storea overload starts with a VML_ASSERT on the pointer; loadu/storeu overloads without ioff have none + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) // checks p % alignment == 0 with VML_ASSERT, then returns *p + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return *p; + } + static realvec_t loadu(real_t const* p) // returns *p without an alignment check + { + return *p; + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) // checks p, then forwards to loada(p+ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const // masked load: loada(p) when all(m.m) holds, otherwise *this is returned as is + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { // NOTE(review): the other masked members here test m.all_m instead; presumably equivalent, confirm against mask_t + return loada(p); + } else { + return *this; + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const // masked load: loadu(p) when m.all_m is true, otherwise *this is returned as is + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return *this; + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const // checks p, then forwards to the masked loada(p+ioff, m) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p+ioff, m); + } + + void storea(real_t* p) const // checks p % alignment == 0 with VML_ASSERT, then writes v to *p + { + VML_ASSERT(intptr_t(p) % alignment == 0); + *p = v; + } + void storeu(real_t* p) const // writes v to *p without an alignment check + { + *p = v; + } + void storeu(real_t* p, std::ptrdiff_t ioff) const // checks p, then forwards to storea(p+ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p+ioff); + } + void storea(real_t* p, mask_t const& m) const // masked store: storea(p) when m.all_m is true, otherwise nothing is written + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } + } + 
void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p+ioff, m); + } + + + + intvec_t as_int() const { return floatprops::as_int(v); } + intvec_t convert_int() const { +#ifdef __x86_64__ + return _mm_cvttsd_si64(_mm_set_sd(v)); +#else + return floatprops::convert_int(v); +#endif + } + + + + realvec operator+() const { return +v; } + realvec operator-() const { return -v; } + + realvec operator+(realvec x) const { return v+x.v; } + realvec operator-(realvec x) const { return v-x.v; } + realvec operator*(realvec x) const { return v*x.v; } + realvec operator/(realvec x) const { return v/x.v; } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const { return v; } + real_t sum() const { return v; } + + + + boolvec_t operator==(realvec const& x) const { return v==x.v; } + boolvec_t operator!=(realvec const& x) const { return v!=x.v; } + boolvec_t operator<(realvec const& x) const { return v(realvec const& x) const { return v>x.v; } + boolvec_t operator>=(realvec const& x) const { return v>=x.v; } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const + { +#ifdef __SSE4_1__ + return to_double(_mm_ceil_sd(from_double(v), 
from_double(v))); +#else + return std::ceil(v); +#endif + } + realvec copysign(realvec y) const { return std::copysign(v, y.v); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return std::fabs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const + { +#ifdef __SSE4_1__ + return to_double(_mm_floor_sd(from_double(v), from_double(v))); +#else + return std::floor(v); +#endif + } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const + { + return to_double(_mm_max_sd(from_double(v), from_double(y.v))); + } + realvec fmin(realvec y) const + { + return to_double(_mm_min_sd(from_double(v), from_double(y.v))); + } + realvec fmod(realvec y) const { return std::fmod(v, y.v); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const + { + int_t r = std::ilogb(v); + if (r == FP_ILOGB0) r = numeric_limits::min(); + else if (r == FP_ILOGBNAN) r = numeric_limits::max(); + return r; + } + boolvec_t isfinite() const { return std::isfinite(v); } + boolvec_t isinf() const { return std::isinf(v); } + boolvec_t isnan() const + { + return _mm_ucomineq_sd(from_double(v), from_double(v)); + } + boolvec_t isnormal() const { return std::isnormal(v); } + realvec ldexp(int_t n) const { return std::ldexp(v, n); } + realvec ldexp(intvec_t n) const { return std::ldexp(v, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return 
MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { return R(1.0)/v; } + realvec remainder(realvec y) const { return std::remainder(v, y.v); } + realvec rint() const + { +#ifdef __SSE4_1__ + return to_double(_mm_round_sd(from_double(v), from_double(v), + _MM_FROUND_TO_NEAREST_INT)); +#else + return MF::vml_rint(*this); +#endif + } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return std::signbit(v); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const + { + return to_double(_mm_sqrt_sd(from_double(v), from_double(v))); + } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const + { +#ifdef __SSE4_1__ + return to_double(_mm_round_sd(from_double(v), from_double(v), + _MM_FROUND_TO_ZERO)); +#else + return MF::vml_trunc(*this); +#endif + } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return I(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return v ? x : y; + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return v ? 
x : y; + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return FP::as_float(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { +#ifdef __x86_64__ + return _mm_cvtsd_f64(_mm_cvtsi64_sd(_mm_setzero_pd(), v)); +#else + return FP::convert_float(v); +#endif + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_SSE_DOUBLE1_H diff --git a/vec_sse_double2.h b/vec_sse_double2.h new file mode 100644 index 0000000..2cbbd38 --- /dev/null +++ b/vec_sse_double2.h @@ -0,0 +1,646 @@ +// -*-C++-*- + +#ifndef VEC_SSE_DOUBLE2_H +#define VEC_SSE_DOUBLE2_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// SSE2 intrinsics +#include +#ifdef __SSE3__ // Intel's SSE 3 +# include +#endif +#ifdef __SSE4_1__ // Intel's SSE 4.1 +# include +#endif +#ifdef __SSE4A__ // AMD's SSE 4a +# include +#endif +#if defined __AVX__ // Intel's AVX +# include +#endif + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_DOUBLE_2 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 2; + typedef bool scalar_t; + typedef __m128d bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + 
// be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): + v(_mm_castsi128_pd(_mm_set1_epi64x(from_bool(a)))) {} + boolvec(bool const* as): + v(_mm_castsi128_pd(_mm_set_epi64x(from_bool(as[1]), from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return _mm_xor_pd(boolvec(true), v); } + + boolvec operator&&(boolvec x) const { return _mm_and_pd(v, x.v); } + boolvec operator||(boolvec x) const { return _mm_or_pd(v, x.v); } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const { return _mm_xor_pd(v, x.v); } + + bool all() const + { + // return (*this)[0] && (*this)[1]; +#if defined __AVX__ + return ! (! *this).any(); +#else + boolvec x = *this; + x = x && _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1)); + return x[0]; +#endif + } + bool any() const + { + // return (*this)[0] || (*this)[1]; +#if defined __AVX__ + return ! 
_mm_testz_pd(v, v); +#else + boolvec x = *this; + x = x || _mm_shuffle_pd(x.v, x.v, _MM_SHUFFLE2(0,1)); + return x[0]; +#endif + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 2; + typedef int_t scalar_t; + typedef __m128i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm_set1_epi64x(a)) {} + intvec(int_t const* as): v(_mm_set_epi64x(as[1], as[0])) {} + static intvec iota() { return _mm_set_epi64x(1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return _mm_castsi128_pd(v); } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. 
+ intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const { return _mm_add_epi64(v, x.v); } + intvec operator-(intvec x) const { return _mm_sub_epi64(v, x.v); } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(v), + _mm_castsi128_pd(x.v))); + } + intvec operator|(intvec x) const + { + return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(v), + _mm_castsi128_pd(x.v))); + } + intvec operator^(intvec x) const + { + return _mm_castpd_si128(_mm_xor_pd(_mm_castsi128_pd(v), + _mm_castsi128_pd(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const { return _mm_srli_epi64(v, n); } + intvec operator>>(int_t n) const + { + // There is no _mm_srai_epi64. 
To emulate it, add 0x80000000 + // before shifting, and subtract the shifted 0x80000000 after + // shifting + intvec x = *this; + // Convert signed to unsiged + x += U(1) << (bits-1); + // Shift + x = x.lsr(n); + // Undo conversion + x -= U(1) << (bits-1-n); + return x; + } + intvec operator<<(int_t n) const { return _mm_slli_epi64(v, n); } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! (*this < x); + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 2; + typedef real_t scalar_t; + typedef __m128d vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm_set1_pd(a)) {} + realvec(real_t const* as): v(_mm_set_pd(as[1], as[0])) {} + + operator vector_t() const { return v; } + 
real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm_load_pd(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm_loadu_pd(p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm_store_pd(p, v); + } + void storeu(real_t* p) const + { + return _mm_storeu_pd(p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { +#if defined __AVX__ + _mm_maskstore_pd(p, m.m.as_int(), v); +#else + if (m.m[0]) _mm_storel_pd(p , v); + else if (m.m[1]) _mm_storeh_pd(p+1, v); +#endif + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if 
(m.m[0]) _mm_storel_pd(p , v); + else if (m.m[1]) _mm_storeh_pd(p+1, v); + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return _mm_castpd_si128(v); } + intvec_t convert_int() const { return MF::vml_convert_int(*this); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return _mm_add_pd(v, x.v); } + realvec operator-(realvec x) const { return _mm_sub_pd(v, x.v); } + realvec operator*(realvec x) const { return _mm_mul_pd(v, x.v); } + realvec operator/(realvec x) const { return _mm_div_pd(v, x.v); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1]; + } + real_t sum() const + { +#ifdef __SSE3__ + return _mm_cvtsd_f64(_mm_hadd_pd(v, v)); +#else + return (*this)[0] + (*this)[1]; +#endif + } + + + + boolvec_t operator==(realvec const& x) const + { + return _mm_cmpeq_pd(v, x.v); + } + boolvec_t operator!=(realvec const& x) const + { + return _mm_cmpneq_pd(v, x.v); + } + boolvec_t operator<(realvec const& x) const + { + return _mm_cmplt_pd(v, x.v); + } + boolvec_t operator<=(realvec const& x) const + { + return _mm_cmple_pd(v, x.v); + } + boolvec_t operator>(realvec const& x) const + { + return _mm_cmpgt_pd(v, x.v); + } + boolvec_t operator>=(realvec const& x) const + { + return _mm_cmpge_pd(v, x.v); + } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return 
MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const + { +#ifdef __SSE4_1__ + return _mm_ceil_pd(v); +#else + return MF::vml_ceil(*this); +#endif + } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return MF::vml_fabs(*this); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const + { +#ifdef __SSE4_1__ + return _mm_floor_pd(v); +#else + return MF::vml_floor(*this); +#endif + } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const { return _mm_max_pd(v, y.v); } + realvec fmin(realvec y) const { return _mm_min_pd(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return _mm_cmpunord_pd(v, v);; } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() 
const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { return _mm_div_pd(_mm_set1_pd(1.0), v); } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const + { +#ifdef __SSE4_1__ + return _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT); +#else + return MF::vml_rint(*this); +#endif + } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return v; } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return _mm_sqrt_pd(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const + { +#ifdef __SSE4_1__ + return _mm_round_pd(v, _MM_FROUND_TO_ZERO); +#else + return MF::vml_trunc(*this); +#endif + } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return _mm_castpd_si128(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + //return ifthen(v, U(1), U(0)); + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { +#ifdef __SSE4_1__ + return _mm_blendv_pd(y.v, x.v, v); +#else + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); +#endif + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return _mm_castsi128_pd(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + return MF::vml_convert_float(*this); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_SSE_DOUBLE2_H diff --git 
a/vec_sse_float1.h b/vec_sse_float1.h new file mode 100644 index 0000000..9e3d12e --- /dev/null +++ b/vec_sse_float1.h @@ -0,0 +1,523 @@ +// -*-C++-*- + +#ifndef VEC_SSE2_FLOAT1_H +#define VEC_SSE2_FLOAT1_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// SSE2 intrinsics +#include +#ifdef __SSE3__ // Intel's SSE 3 +# include +#endif +#ifdef __SSE4_1__ // Intel's SSE 4.1 +# include +#endif +#ifdef __SSE4A__ // AMD's SSE 4a +# include +#endif +#if defined __AVX__ // Intel's AVX +# include +#endif + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FLOAT_1 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 1; + typedef bool scalar_t; + typedef uint_t bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + // true values are non-zero, false values are zero + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(a) {} + // TODO: remove this + boolvec(int x): v(x) {} + boolvec(bool const* as): v(as[0]) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return v; } + boolvec& set_elt(int n, bool a) { return v=a, *this; } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec 
operator!() const { return !v; } + + boolvec operator&&(boolvec x) const { return v && x.v; } + boolvec operator||(boolvec x) const { return v || x.v; } + boolvec operator==(boolvec x) const { return bool(v) == bool(x.v); } + boolvec operator!=(boolvec x) const { return bool(v) != bool(x.v); } + + bool all() const { return v; } + bool any() const { return v; } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 1; + typedef int_t scalar_t; + typedef int_t ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(int_t a): v(a) {} + intvec(int_t const* as): v(as[0]) {} + static intvec iota() { return intvec(I(0)); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return v; } + intvec& set_elt(int n, int_t a) { return v=a, *this; } + + + + boolvec_t as_bool() const { return U(v); } + boolvec_t convert_bool() const { return bool(v); } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + intvec operator+() const { return +v; } + intvec operator-() const { return -v; } + + intvec operator+(intvec x) const { return 
v+x.v; } + intvec operator-(intvec x) const { return v-x.v; } + intvec operator*(intvec x) const { return v*x.v; } + intvec operator/(intvec x) const { return v/x.v; } + intvec operator%(intvec x) const { return v%x.v; } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + intvec& operator*=(intvec const& x) { return *this=*this*x; } + intvec& operator/=(intvec const& x) { return *this=*this/x; } + intvec& operator%=(intvec const& x) { return *this=*this%x; } + + + + intvec operator~() const { return ~v; } + + intvec operator&(intvec x) const { return v&x.v; } + intvec operator|(intvec x) const { return v|x.v; } + intvec operator^(intvec x) const { return v^x.v; } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const { return U(v) >> U(n); } + intvec operator>>(int_t n) const { return v>>n; } + intvec operator<<(int_t n) const { return v<>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n); } + intvec operator>>(intvec n) const { return v>>n; } + intvec operator<<(intvec n) const { return v<>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<(intvec const& x) const { return v>x.v; } + boolvec_t operator>=(intvec const& x) const { return v>=x.v; } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 1; + typedef real_t scalar_t; + typedef float vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + private: + static __m128 from_float(float a) { return _mm_set_ss(a); } + static float to_float(__m128 a) { return 
_mm_cvtss_f32(a); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(real_t a): v(a) {} + realvec(real_t const* as): v(as[0]) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return v; } + realvec& set_elt(int n, real_t a) { return v=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return *p; + } + static realvec_t loadu(real_t const* p) + { + return *p; + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return *this; + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return *this; + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return loada(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + *p = v; + } + void storeu(real_t* p) const + { + *p = v; + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + 
VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + storea(p+ioff, m); + } + + + + intvec_t as_int() const { return floatprops::as_int(v); } + intvec_t convert_int() const { + // return floatprops::convert_int(v); + return _mm_cvttss_si32(_mm_set_ss(v)); + } + + + + realvec operator+() const { return +v; } + realvec operator-() const { return -v; } + + realvec operator+(realvec x) const { return v+x.v; } + realvec operator-(realvec x) const { return v-x.v; } + realvec operator*(realvec x) const { return v*x.v; } + realvec operator/(realvec x) const { return v/x.v; } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const { return v; } + real_t sum() const { return v; } + + + + boolvec_t operator==(realvec const& x) const { return v==x.v; } + boolvec_t operator!=(realvec const& x) const { return v!=x.v; } + boolvec_t operator<(realvec const& x) const { return v(realvec const& x) const { return v>x.v; } + boolvec_t operator>=(realvec const& x) const { return v>=x.v; } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const + { 
+#ifdef __SSE4_1__ + return to_float(_mm_ceil_ss(from_float(v), from_float(v))); +#else + return std::ceil(v); +#endif + } + realvec copysign(realvec y) const { return std::copysign(v, y.v); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return std::fabs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const + { +#ifdef __SSE4_1__ + return to_float(_mm_floor_ss(from_float(v), from_float(v))); +#else + return std::floor(v); +#endif + } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const + { + return to_float(_mm_max_ss(from_float(v), from_float(y.v))); + } + realvec fmin(realvec y) const + { + return to_float(_mm_min_ss(from_float(v), from_float(y.v))); + } + realvec fmod(realvec y) const { return std::fmod(v, y.v); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const + { + int_t r = std::ilogb(v); + if (r == FP_ILOGB0) r = numeric_limits::min(); + else if (r == FP_ILOGBNAN) r = numeric_limits::max(); + return r; + } + boolvec_t isfinite() const { return std::isfinite(v); } + boolvec_t isinf() const { return std::isinf(v); } + boolvec_t isnan() const + { + return _mm_ucomineq_ss(from_float(v), from_float(v)); + } + boolvec_t isnormal() const { return std::isnormal(v); } + realvec ldexp(int_t n) const { return std::ldexp(v, n); } + realvec ldexp(intvec_t n) const { return std::ldexp(v, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec 
nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const { return R(1.0)/v; } + realvec remainder(realvec y) const { return std::remainder(v, y.v); } + realvec rint() const + { +#ifdef __SSE4_1__ + return to_float(_mm_round_ss(from_float(v), from_float(v), + _MM_FROUND_TO_NEAREST_INT)); +#else + return MF::vml_rint(*this); +#endif + } + realvec round() const { return MF::vml_round(*this); } + realvec rsqrt() const { return MF::vml_rsqrt(*this); } + boolvec_t signbit() const { return std::signbit(v); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + // realvec sqrt1() const { return std::sqrt(v); } + realvec sqrt() const { return to_float(_mm_sqrt_ss(from_float(v))); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const + { +#ifdef __SSE4_1__ + return to_float(_mm_round_ss(from_float(v), from_float(v), + _MM_FROUND_TO_ZERO)); +#else + return MF::vml_trunc(*this); +#endif + } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return I(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return v ? x : y; + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return v ? 
x : y; + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return FP::as_float(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + // return FP::convert_float(v); + return _mm_cvtss_f32(_mm_cvtsi32_ss(_mm_setzero_ps(), v)); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_SSE2_FLOAT1_H diff --git a/vec_sse_float4.h b/vec_sse_float4.h new file mode 100644 index 0000000..5259cb2 --- /dev/null +++ b/vec_sse_float4.h @@ -0,0 +1,651 @@ +// -*-C++-*- + +#ifndef VEC_SSE_FLOAT4_H +#define VEC_SSE_FLOAT4_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// SSE2 intrinsics +#include +#ifdef __SSE3__ // Intel's SSE 3 +# include +#endif +#if defined __SSE4_1__ // Intel's SSE 4.1 +# include +#endif +#if defined __SSE4A__ // AMD's SSE 4a +# include +#endif +#if defined __AVX__ // Intel's AVX +# include +#endif + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FLOAT_4 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 4; + typedef bool scalar_t; + typedef __m128 bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - int_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // 
boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): + v(_mm_castsi128_ps(_mm_set1_epi32(from_bool(a)))) {} + boolvec(bool const* as): + v(_mm_castsi128_ps(_mm_set_epi32(from_bool(as[3]), + from_bool(as[2]), + from_bool(as[1]), + from_bool(as[0])))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return _mm_xor_ps(boolvec(true), v); } + + boolvec operator&&(boolvec x) const { return _mm_and_ps(v, x.v); } + boolvec operator||(boolvec x) const { return _mm_or_ps(v, x.v); } + boolvec operator==(boolvec x) const { return !(*this!=x); } + boolvec operator!=(boolvec x) const { return _mm_xor_ps(v, x.v); } + + bool all() const + { + // return (*this)[0] && (*this)[1] && (*this)[2] && (*this)[3]; +#if defined __AVX__ + return ! (! *this).any(); +#else + boolvec x = *this; + x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2)); + x = x && _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); + return x[0]; +#endif + } + bool any() const + { + // return (*this)[0] || (*this)[1] || (*this)[2] || (*this)[3]; +#if defined __AVX__ + return ! 
_mm_testz_ps(v, v); +#else + boolvec x = *this; + x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(1,0,3,2)); + x = x || _mm_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2,3,0,1)); + return x[0]; +#endif + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec: floatprops + { + static int const size = 4; + typedef int_t scalar_t; + typedef __m128i ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm_set1_epi32(a)) {} + intvec(int_t const* as): v(_mm_set_epi32(as[3], as[2], as[1], as[0])) {} + static intvec iota() { return _mm_set_epi32(3, 2, 1, 0); } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return _mm_castsi128_ps(v); } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + return ! 
IV(_mm_cmpeq_epi32(v, IV(0))).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(0) - *this; } + + intvec operator+(intvec x) const { return _mm_add_epi32(v, x.v); } + intvec operator-(intvec x) const { return _mm_sub_epi32(v, x.v); } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), + _mm_castsi128_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm_castps_si128(_mm_or_ps(_mm_castsi128_ps(v), + _mm_castsi128_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(v), + _mm_castsi128_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const { return _mm_srli_epi32(v, n); } + intvec operator>>(int_t n) const { return _mm_srai_epi32(v, n); } + intvec operator<<(int_t n) const { return _mm_slli_epi32(v, n); } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this< x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! 
(*this < x); + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 4; + typedef real_t scalar_t; + typedef __m128 vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm_set1_ps(a)) {} + realvec(real_t const* as): v(_mm_set_ps(as[3], as[2], as[1], as[0])) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return _mm_load_ps(p); + } + static realvec_t loadu(real_t const* p) + { + return _mm_loadu_ps(p); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& 
m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + _mm_store_ps(p, v); + } + void storeu(real_t* p) const + { + return _mm_storeu_ps(p, v); + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { +#if defined __AVX__ + _mm_maskstore_ps(p, m.m.as_int(), v); +#else + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; +#endif + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + if (m.m[2]) p[2] = (*this)[2]; + if (m.m[3]) p[3] = (*this)[3]; + } + } + void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return _mm_castps_si128(v); } + intvec_t convert_int() const { return _mm_cvttps_epi32(v); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + realvec operator+(realvec x) const { return _mm_add_ps(v, x.v); } + realvec operator-(realvec x) const { return _mm_sub_ps(v, x.v); } + realvec operator*(realvec x) const { return _mm_mul_ps(v, x.v); } + realvec 
operator/(realvec x) const { return _mm_div_ps(v, x.v); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const + { + return (*this)[0] * (*this)[1] * (*this)[2] * (*this)[3]; + } + real_t sum() const + { +#ifdef __SSE3__ + realvec x = *this; + x = _mm_hadd_ps(x.v, x.v); + x = _mm_hadd_ps(x.v, x.v); + return x[0]; +#else + return (*this)[0] + (*this)[1] + (*this)[2] + (*this)[3]; +#endif + } + + + + boolvec_t operator==(realvec const& x) const + { + return _mm_cmpeq_ps(v, x.v); + } + boolvec_t operator!=(realvec const& x) const + { + return _mm_cmpneq_ps(v, x.v); + } + boolvec_t operator<(realvec const& x) const + { + return _mm_cmplt_ps(v, x.v); + } + boolvec_t operator<=(realvec const& x) const + { + return _mm_cmple_ps(v, x.v); + } + boolvec_t operator>(realvec const& x) const + { + return _mm_cmpgt_ps(v, x.v); + } + boolvec_t operator>=(realvec const& x) const + { + return _mm_cmpge_ps(v, x.v); + } + + + + realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const + { +#ifdef __SSE4_1__ + return _mm_ceil_ps(v); +#else + return MF::vml_ceil(*this); +#endif + } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { 
return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return MF::vml_fabs(*this); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const + { +#ifdef __SSE4_1__ + return _mm_floor_ps(v); +#else + return MF::vml_floor(*this); +#endif + } + realvec fma(realvec y, realvec z) const { return MF::vml_fma(*this, y, z); } + realvec fmax(realvec y) const { return _mm_max_ps(v, y.v); } + realvec fmin(realvec y) const { return _mm_min_ps(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return _mm_cmpunord_ps(v, v);; } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + realvec rcp() const + { + realvec x = *this; + realvec r = _mm_rcp_ps(x); // this is only an approximation + r *= RV(2.0) - r*x; // one Newton iteration (see vml_rcp) + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const + { +#ifdef __SSE4_1__ + return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT); +#else + return MF::vml_rint(*this); +#endif + } + realvec round() const { return 
MF::vml_round(*this); } + realvec rsqrt() const + { + realvec x = *this; + realvec r = _mm_rsqrt_ps(x); // this is only an approximation + r *= RV(1.5) - RV(0.5)*x * r*r; // one Newton iteration (see vml_rsqrt) + return r; + } + boolvec_t signbit() const { return v; } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + realvec sqrt() const { return _mm_sqrt_ps(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const + { +#ifdef __SSE4_1__ + return _mm_round_ps(v, _MM_FROUND_TO_ZERO); +#else + return MF::vml_trunc(*this); +#endif + } + }; + + + + // boolvec definitions + + inline + auto boolvec::as_int() const -> intvec_t + { + return _mm_castps_si128(v); + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { +#ifdef __SSE4_1__ + return _mm_blendv_ps(y.v, x.v, v); +#else + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); +#endif + } + + + + // intvec definitions + + inline auto intvec::as_float() const -> realvec_t + { + return _mm_castsi128_ps(v); + } + + inline auto intvec::convert_float() const -> realvec_t + { + return _mm_cvtepi32_ps(v); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_SSE_FLOAT4_H diff --git a/vec_vsx_double2.h b/vec_vsx_double2.h new file mode 100644 index 0000000..024056d --- /dev/null +++ b/vec_vsx_double2.h @@ -0,0 +1,656 @@ +// -*-C++-*- + +#ifndef VEC_VSX_DOUBLE2_H +#define VEC_VSX_DOUBLE2_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include + +// VSX intrinsics +#include +#undef vector +#undef pixel +#undef bool + + + +namespace vecmathlib { + +#define 
VECMATHLIB_HAVE_VEC_DOUBLE_2 + template<> struct boolvec; + template<> struct intvec; + template<> struct realvec; + + + + template<> + struct boolvec: floatprops + { + static int const size = 2; + typedef bool scalar_t; + typedef __vector __bool long long bvector_t; + static int const alignment = sizeof(bvector_t); + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values are -1, false values are 0 + // truth values are interpreted bit-wise + static uint_t from_bool(bool a) { return -int_t(a); } + static bool to_bool(uint_t a) { return a; } + public: + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(vec_splats(from_bool(a))) {} + boolvec(bool const* as) + { + for (int d=0; d + struct intvec: floatprops + { + static int const size = 2; + typedef int_t scalar_t; + typedef __vector long long ivector_t; + static int const alignment = sizeof(ivector_t); + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // 
intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(vec_splats(a)) {} + intvec(int_t const* as) + { + for (int d=0; d 1436 + // exchange pairs + static __vector unsigned char perm_int_swap() + { + return + (__vector unsigned char) + {4,5,6,7, 16,17,18,19, 12,13,14,15, 24,25,26,27}; + } + // 0123 4567 -> 0426 + // broadcast high elements of pairs + static __vector unsigned char perm_int_bchi() + { + return + (__vector unsigned char) + {0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27}; + } + public: + + + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + // return vec_add(v, x.v); + __vector unsigned int a = (__vector unsigned int)v; + __vector unsigned int b = (__vector unsigned int)x.v; + __vector unsigned int s = vec_add(a, b); + __vector unsigned int c = vec_addc(a, b); + __vector unsigned int z = vec_xor(z, z); + c = vec_perm(c, z, perm_int_swap()); + s = vec_add(s, c); + return (__vector long long)s; + } + intvec operator-(intvec x) const + { + // return vec_sub(v, x.v); + __vector unsigned int a = (__vector unsigned int)v; + __vector unsigned int b = (__vector unsigned int)x.v; + __vector unsigned int d = vec_sub(a, b); + __vector unsigned int c = vec_subc(a, b); + c = vec_sub(vec_splats(1U), c); + __vector unsigned int z = vec_xor(z, z); + c = vec_perm(c, z, perm_int_swap()); + d = vec_sub(d, c); + return (__vector long long)d; + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const + { + return (__vector long long)vec_nor((__vector int)v, (__vector int)v); + } + + intvec operator&(intvec x) const + { + return (__vector long long)vec_and((__vector int)v, (__vector int)x.v); + } + intvec operator|(intvec x) const + { + return (__vector long long)vec_or ((__vector 
int)v, (__vector int)x.v); + } + intvec operator^(intvec x) const + { + return (__vector long long)vec_xor((__vector int)v, (__vector int)x.v); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const { return lsr(IV(n)); } + intvec operator>>(int_t n) const { return *this >> IV(n); } + intvec operator<<(int_t n) const { return *this << IV(n); } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + // return vec_sra(v, (__vector unsigned long long)n.v); + intvec r; + for (int i=0; i> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + // return vec_sl(v, (__vector unsigned long long)n.v); + intvec r; + for (int i=0; i>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<> (bits-1)).as_bool(); + } + + boolvec_t operator==(intvec const& x) const + { + // return vec_cmpeq(v, x.v); + __vector int a = (__vector int)v; + __vector int b = (__vector int)x.v; + __vector __bool int c = vec_cmpeq(a, b); + __vector __bool int cx = vec_perm(c, c, perm_int_swap()); + __vector __bool int r = vec_and(c, cx); + return (__vector __bool long long)r; + } + boolvec_t operator!=(intvec const& x) const { return !(*this == x); } + boolvec_t operator<(intvec const& x) const + { + __vector int a = (__vector int)v; + __vector int b = (__vector int)x.v; + __vector __bool int lt = vec_cmplt(a, b); + __vector __bool int eq = vec_cmpeq(a, b); + __vector unsigned int ua = (__vector unsigned int)v; + __vector unsigned int ub = (__vector unsigned int)x.v; + __vector __bool int ult = vec_cmplt(ua, ub); + __vector __bool int ultx = vec_perm(ult, ult, perm_int_swap()); + __vector __bool int r = vec_or(lt, vec_and(eq, ultx)); + r = vec_perm(r, r, 
perm_int_bchi()); + return (__vector __bool long long)r; + } + boolvec_t operator<=(intvec const& x) const + { + return ! (*this > x); + } + boolvec_t operator>(intvec const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec const& x) const + { + return ! (*this < x); + } + }; + + + + template<> + struct realvec: floatprops + { + static int const size = 2; + typedef real_t scalar_t; + typedef __vector double vector_t; + static int const alignment = sizeof(vector_t); + + static char const* name() { return ""; } + void barrier() { __asm__("": "+v" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec boolvec_t; + typedef intvec intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops FP; + typedef mathfuncs MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(vec_splats(a)) {} + realvec(real_t const* as) + { + for (int d=0; d mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + return vec_ld(0, (const __vector double*)p); + } + static realvec_t loadu(real_t const* p) + { + realvec_t v0 = vec_ld(0, (const __vector double*)p); + realvec_t v1 = vec_ld(15, (const __vector double*)p); + return vec_perm(v0.v, v1.v, vec_lvsl(0, p)); + } + static realvec_t loadu(real_t const* p, std::ptrdiff_t ioff) + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if 
(__builtin_expect(m.all_m, true)) { + /* Uses the mask's cached all_m flag, like the other masked load/store overloads, instead of recomputing all(m.m). */ return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + /* Masked unaligned load: a plain loadu when m.all_m is set; otherwise the loaded lanes are merged with *this through m.m.ifthen. */ realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + /* Masked load from p+ioff, where p itself is asserted to be aligned: the aligned overload is used when ioff is a multiple of size. */ realvec_t loadu(real_t const* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return loada(p+ioff, m); + return loadu(p+ioff, m); + } + + /* Aligned store: asserts that p is a multiple of alignment, then issues a single vec_st. */ void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + vec_st(v, 0, (__vector double*)p); + } + /* Unaligned store: writes the two lanes one element at a time. */ void storeu(real_t* p) const + { + // Vector stores would require vector loads, which would need to + // be atomic + // TODO: see for good ideas + p[0] = (*this)[0]; + p[1] = (*this)[1]; + } + void storeu(real_t* p, std::ptrdiff_t ioff) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // Use vec_ste? + if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // Use vec_ste? 
+ if (m.m[0]) p[0] = (*this)[0]; + if (m.m[1]) p[1] = (*this)[1]; + } + } + /* Masked store to p+ioff, where p itself is asserted to be aligned: the aligned overload is used when ioff is a multiple of size. */ void storeu(real_t* p, std::ptrdiff_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % alignment == 0); + if (ioff % realvec::size == 0) return storea(p+ioff, m); + storeu(p+ioff, m); + } + + + + /* as_int casts the raw vector to __vector long long; convert_int forwards to the generic MF::vml_convert_int. */ intvec_t as_int() const { return (__vector long long) v; } + intvec_t convert_int() const { return MF::vml_convert_int(*this); } + + + + realvec operator+() const { return *this; } + realvec operator-() const { return RV(0.0) - *this; } + + /* Lane-wise arithmetic maps directly onto the vec_add, vec_sub, vec_mul and vec_div intrinsics. */ realvec operator+(realvec x) const { return vec_add(v, x.v); } + realvec operator-(realvec x) const { return vec_sub(v, x.v); } + realvec operator*(realvec x) const { return vec_mul(v, x.v); } + realvec operator/(realvec x) const { return vec_div(v, x.v); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + /* Horizontal reductions over the lanes of this vector. This type declares size = 2, so only lanes 0 and 1 exist; lanes 2 and 3 must not be indexed here. */ real_t prod() const + { + return (*this)[0] * (*this)[1]; + } + real_t sum() const + { + return (*this)[0] + (*this)[1]; + } + + + + boolvec_t operator==(realvec const& x) const { return vec_cmpeq(v, x.v); } + boolvec_t operator!=(realvec const& x) const { return ! 
(*this == x); } + /* Ordered comparisons map directly onto the vec_cmplt, vec_cmple, vec_cmpgt and vec_cmpge intrinsics. */ boolvec_t operator<(realvec const& x) const { return vec_cmplt(v, x.v); } + boolvec_t operator<=(realvec const& x) const { return vec_cmple(v, x.v); } + boolvec_t operator>(realvec const& x) const { return vec_cmpgt(v, x.v); } + boolvec_t operator>=(realvec const& x) const { return vec_cmpge(v, x.v); } + + + + /* Elementary functions. Members returning MF::vml_* forward to the generic mathfuncs implementation; ceil, fabs, floor, fma, fmax and fmin instead call a vec_* intrinsic on the raw vector. */ realvec acos() const { return MF::vml_acos(*this); } + realvec acosh() const { return MF::vml_acosh(*this); } + realvec asin() const { return MF::vml_asin(*this); } + realvec asinh() const { return MF::vml_asinh(*this); } + realvec atan() const { return MF::vml_atan(*this); } + realvec atan2(realvec y) const { return MF::vml_atan2(*this, y); } + realvec atanh() const { return MF::vml_atanh(*this); } + realvec cbrt() const { return MF::vml_cbrt(*this); } + realvec ceil() const { return vec_ceil(v); } + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec cos() const { return MF::vml_cos(*this); } + realvec cosh() const { return MF::vml_cosh(*this); } + realvec exp() const { return MF::vml_exp(*this); } + realvec exp10() const { return MF::vml_exp10(*this); } + realvec exp2() const { return MF::vml_exp2(*this); } + realvec expm1() const { return MF::vml_expm1(*this); } + realvec fabs() const { return vec_abs(v); } + realvec fdim(realvec y) const { return MF::vml_fdim(*this, y); } + realvec floor() const { return vec_floor(v); } + /* fma(y, z) passes (v, y.v, z.v) to vec_madd, in that order. */ realvec fma(realvec y, realvec z) const { return vec_madd(v, y.v, z.v); } + realvec fmax(realvec y) const { return vec_max(v, y.v); } + realvec fmin(realvec y) const { return vec_min(v, y.v); } + realvec fmod(realvec y) const { return MF::vml_fmod(*this, y); } + realvec hypot(realvec y) const { return MF::vml_hypot(*this, y); } + /* Classification helpers return intvec_t / boolvec_t and are all forwarded to MF. */ intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() 
const { return MF::vml_isnormal(*this); } + /* ldexp accepts either a scalar or a per-lane integer exponent; both overloads forward to MF::vml_ldexp. */ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + realvec log() const { return MF::vml_log(*this); } + realvec log10() const { return MF::vml_log10(*this); } + realvec log1p() const { return MF::vml_log1p(*this); } + realvec log2() const { return MF::vml_log2(*this); } + realvec nextafter(realvec y) const { return MF::vml_nextafter(*this, y); } + realvec pow(realvec y) const { return MF::vml_pow(*this, y); } + /* Reciprocal: starts from the vec_re estimate and applies the update r += r * (1 - x*r) twice. */ realvec rcp() const + { + realvec x = *this; + realvec r = vec_re(v); // this is only an approximation + // TODO: use fma + // Note: don't rewrite this expression, this may introduce + // cancellation errors + r += r * (RV(1.0) - x*r); // two Newton iterations (see vml_rcp) + r += r * (RV(1.0) - x*r); + return r; + } + realvec remainder(realvec y) const { return MF::vml_remainder(*this, y); } + realvec rint() const { return vec_rint(v); } + realvec round() const { return MF::vml_round(*this); } + /* rsqrt returns vec_rsqrt(v) directly; the estimate-plus-Newton variant is kept only as commented-out code. */ realvec rsqrt() const + { + // realvec x = *this; + // realvec r = vec_rsqrte(x.v); // this is only an approximation + // // TODO: use fma + // // one Newton iteration (see vml_rsqrt) + // r += RV(0.5)*r * (RV(1.0) - x * r*r); + // return r; + return vec_rsqrt(v); + } + boolvec_t signbit() const { return MF::vml_signbit(*this); } + realvec sin() const { return MF::vml_sin(*this); } + realvec sinh() const { return MF::vml_sinh(*this); } + // realvec sqrt() const { return *this * rsqrt(); } + /* sqrt and trunc use the vec_sqrt / vec_trunc intrinsics; tan and tanh forward to MF. */ realvec sqrt() const { return vec_sqrt(v); } + realvec tan() const { return MF::vml_tan(*this); } + realvec tanh() const { return MF::vml_tanh(*this); } + realvec trunc() const { return vec_trunc(v); } + }; + + + + // boolvec definitions + + /* Out-of-class boolvec members: as_int casts v to __vector long long unchanged, while convert_int returns the negation of that cast value. */ inline + auto boolvec::as_int() const -> intvec_t + { + return (__vector long long) v; + } + + inline + auto boolvec::convert_int() const -> intvec_t + { + return -(__vector long long)v; + } + + inline + auto 
boolvec::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + /* Selection is delegated to vec_sel with y.v as the first operand, x.v as the second and this boolvec's v as the selector. */ return vec_sel(y.v, x.v, v); + } + + inline + auto boolvec::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + /* Same operand order as the intvec_t overload. */ return vec_sel(y.v, x.v, v); + } + + + + // intvec definitions + + /* as_float casts v to __vector double; convert_float forwards to MF::vml_convert_float (the vec_ctd call is left commented out). */ inline auto intvec::as_float() const -> realvec_t + { + return (__vector double)v; + } + + inline auto intvec::convert_float() const -> realvec_t + { + // return vec_ctd(v, 0); + return MF::vml_convert_float(*this); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_VSX_DOUBLE2_H diff --git a/vecmathlib.h b/vecmathlib.h index e79188a..3e04b19 100644 --- a/vecmathlib.h +++ b/vecmathlib.h @@ -53,35 +53,38 @@ namespace std { class type_info; } #if defined __ARM_PCS_VFP // ARM NEON // TODO: VFP -# include "vec_float_neon.h" +// TODO: vec_neon_float4 +# include "vec_neon_float2.h" #endif #if defined __SSE2__ // Intel SSE 2 -# include "vec_float_sse2_scalar.h" -# include "vec_double_sse2_scalar.h" -# include "vec_float_sse2.h" -# include "vec_double_sse2.h" +# include "vec_sse_float1.h" +# include "vec_sse_float4.h" +# include "vec_sse_double1.h" +# include "vec_sse_double2.h" #endif #if defined __AVX__ // Intel AVX -# include "vec_fp8_avx.h" -# include "vec_fp16_avx.h" -# include "vec_float_avx.h" -# include "vec_double_avx.h" +# include "vec_avx_fp8_32.h" +# include "vec_avx_fp16_16.h" +# include "vec_avx_float8.h" +# include "vec_avx_double4.h" #endif +// TODO: MIC + #if defined __ALTIVEC__ // IBM Altivec -# include "vec_float_altivec.h" +# include "vec_altivec_float4.h" #endif #if defined __VSX__ // IBM VSX -# include "vec_double_vsx.h" +# include "vec_vsx_double2.h" #endif // TODO: IBM Blue Gene/P DoubleHummer #if defined __bgq__ && defined __VECTOR4DOUBLE__ // IBM Blue Gene/Q QPX -// TODO: vec_float_qpx -# include "vec_double_qpx.h" +// TODO: vec_qpx_float4 +# include "vec_qpx_double4.h" #endif #endif // #ifndef VECMATHLIB_H -- cgit v1.1