diff options
-rw-r--r-- | floatprops.h | 140 | ||||
-rw-r--r-- | floattypes.h | 22 | ||||
-rw-r--r-- | mathfuncs_fabs.h | 4 | ||||
-rw-r--r-- | vec_fp16_avx.h | 579 | ||||
-rw-r--r-- | vec_fp8_avx.h | 645 | ||||
-rw-r--r-- | vecmathlib.h | 14 |
6 files changed, 1373 insertions, 31 deletions
diff --git a/floatprops.h b/floatprops.h index 8d4ddcf..c39c788 100644 --- a/floatprops.h +++ b/floatprops.h @@ -3,6 +3,8 @@ #ifndef FLOATPROPS_H #define FLOATPROPS_H +#include "floattypes.h" + #include <cmath> #include <cstdint> #include <cstring> @@ -27,8 +29,116 @@ namespace vecmathlib { // max_exponent }; - template<typename int_t> - struct intprops { + + + // Properties of fp8 + template<> + struct floatprops<fp8> { + typedef fp8 real_t; + typedef int8_t int_t; + typedef uint8_t uint_t; + + // Definitions that might come from numeric_limits<> instead: + static int const digits = 4; + static int epsilon() { __builtin_unreachable(); } + static int const min_exponent = -6; + static int const max_exponent = 7; + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = + ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits-1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) + { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t as_int(real_t x) + { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + + // Convert values + static real_t convert_float(int_t x) { __builtin_unreachable(); } + static int_t convert_int(real_t x) { __builtin_unreachable(); } + }; + + + + // Properties of fp16 + template<> + struct floatprops<fp16> { + typedef fp16 real_t; + typedef int16_t int_t; + typedef uint16_t uint_t; + + // Definitions that might come from numeric_limits<> instead: + static int const digits = 11; + static int epsilon() { __builtin_unreachable(); } + static int const min_exponent = -14; + static int const max_exponent = 15; + + // Ensure the sizes match + static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size"); + static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size"); + + // Number of bits in internal representation + static int const bits = 8 * sizeof(real_t); + static int const mantissa_bits = digits - 1; + static int const signbit_bits = 1; + static int const exponent_bits = bits - mantissa_bits - signbit_bits; + static int const exponent_offset = 2 - min_exponent; + static_assert(mantissa_bits + exponent_bits + signbit_bits == bits, + "error in bit counts"); + static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1; + static uint_t const exponent_mask = + ((uint_t(1) << exponent_bits) - 1) << mantissa_bits; + static uint_t const signbit_mask = uint_t(1) << (bits-1); + static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0), + "error in masks"); + static_assert((mantissa_mask | exponent_mask | signbit_mask) == + uint_t(~uint_t(0)), + "error in masks"); + + // Re-interpret bit patterns + static real_t as_float(int_t x) + { + real_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + static int_t as_int(real_t x) + { + int_t res; + std::memcpy(&res, &x, sizeof res); + return res; + } + + // Convert values + static real_t convert_float(int_t x) { __builtin_unreachable(); } + static int_t convert_int(real_t x) { __builtin_unreachable(); } }; @@ -66,7 +176,7 @@ namespace vecmathlib { "error in masks"); // Re-interpret bit patterns - static inline real_t as_float(int_t x) + static real_t as_float(int_t x) { // return *(real_t*)&x; // union { int_t i; real_t r; } ir; @@ -75,7 +185,7 @@ namespace vecmathlib { std::memcpy(&res, &x, sizeof res); return res; } - static inline int_t as_int(real_t x) + static int_t as_int(real_t x) { // return *(int_t*)&x; // union { real_t r; int_t i; } ri; @@ -86,8 +196,8 @@ namespace vecmathlib { } // Convert values - static inline real_t convert_float(int_t x) { return real_t(x); } - static inline int_t convert_int(real_t x) + static real_t convert_float(int_t x) { return real_t(x); } + static int_t convert_int(real_t x) { static_assert(sizeof std::lrint(x) >= sizeof(int_t), "lrint() has wrong return type"); @@ -103,11 +213,6 @@ namespace vecmathlib { } }; - template<> - struct intprops<floatprops<float>::int_t> { - typedef float real_t; - }; - // Properties of double @@ -143,7 +248,7 @@ namespace vecmathlib { "error in masks"); // Re-interpret bit patterns - static inline real_t as_float(int_t x) + static real_t as_float(int_t x) { // return *(real_t*)&x; // union { int_t i; real_t r; } ir; @@ -152,7 +257,7 @@ namespace vecmathlib { std::memcpy(&res, &x, sizeof res); return res; } - static inline int_t as_int(real_t x) + static int_t as_int(real_t x) { // return *(int_t*)&x; // union { real_t r; int_t i; } ri; @@ -163,8 +268,8 @@ namespace vecmathlib { } // Convert values - static inline real_t convert_float(int_t x) { return real_t(x); } - static inline int_t convert_int(real_t x) + static real_t convert_float(int_t x) { return real_t(x); } + static int_t convert_int(real_t x) { static_assert(sizeof std::lrint(x) >= sizeof(int_t), "lrint() has wrong return type"); @@ -180,11 +285,6 @@ namespace vecmathlib { } }; - template<> - struct intprops<floatprops<double>::int_t> { - typedef double real_t; - }; - } // namespace vecmathlib diff --git a/floattypes.h b/floattypes.h new file mode 100644 index 0000000..008f695 --- /dev/null +++ b/floattypes.h @@ -0,0 +1,22 @@ +// -*-C++-*- + +#ifndef FLOATTYPESS_H +#define FLOATTYPESS_H + +#include <cstdint> + +namespace vecmathlib { + + struct fp8 { + // 1 bit sign, 4 bits exponent, 3 bits mantissa + std::uint8_t val; + }; + + struct fp16 { + // 1 bit sign, 5 bits exponent, 10 bits mantissa + std::uint16_t val; + }; + +} // namespace vecmathlib + +#endif // #ifndef FLOATTYPES_H diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h index 1050147..952dbef 100644 --- a/mathfuncs_fabs.h +++ b/mathfuncs_fabs.h @@ -14,7 +14,7 @@ namespace vecmathlib { template<typename realvec_t> realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y) { - intvec_t value = as_int(x) & IV(~FP::signbit_mask); + intvec_t value = as_int(x) & IV(U(~FP::signbit_mask)); intvec_t sign = as_int(y) & IV(FP::signbit_mask); return as_float(sign | value); } @@ -22,7 +22,7 @@ namespace vecmathlib { template<typename realvec_t> realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) { - return as_float(as_int(x) & IV(~FP::signbit_mask)); + return as_float(as_int(x) & IV(U(~FP::signbit_mask))); } template<typename realvec_t> diff --git a/vec_fp16_avx.h b/vec_fp16_avx.h new file mode 100644 index 0000000..b52d67c --- /dev/null +++ b/vec_fp16_avx.h @@ -0,0 +1,579 @@ +// -*-C++-*- + +#ifndef VEC_FP16_AVX_H +#define VEC_FP16_AVX_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include <cmath> + +// AVX intrinsics +#include <immintrin.h> + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FP16_16 + template<> struct boolvec<fp16,16>; + template<> struct intvec<fp16,16>; + template<> struct realvec<fp16,16>; + + + + template<> + struct boolvec<fp16,16>: floatprops<fp16> + { + static int const size = 16; + typedef bool scalar_t; + typedef __m256i bvector_t; + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {} + boolvec(bool const* as): + v(_mm256_set_epi16(from_bool(as[15]), + from_bool(as[14]), + from_bool(as[13]), + from_bool(as[12]), + from_bool(as[11]), + from_bool(as[10]), + from_bool(as[ 9]), + from_bool(as[ 8]), + from_bool(as[ 7]), + from_bool(as[ 6]), + from_bool(as[ 5]), + from_bool(as[ 4]), + from_bool(as[ 3]), + from_bool(as[ 2]), + from_bool(as[ 1]), + from_bool(as[ 0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this==x); } + boolvec operator!=(boolvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + bool all() const + { + bool r = true; + for (int n=0; n<size; ++n) r = r && (*this)[n]; + return r; + } + bool any() const + { + bool r = false; + for (int n=0; n<size; ++n) r = r || (*this)[n]; + return r; + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec<fp16,16>: floatprops<fp16> + { + static int const size = 16; + typedef int_t scalar_t; + typedef __m256i ivector_t; + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi16(a)) {} + intvec(int_t const* as): + v(_mm256_set_epi16(as[15], + as[14], + as[13], + as[12], + as[11], + as[10], + as[ 9], + as[ 8], + as[ 7], + as[ 6], + as[ 5], + as[ 4], + as[ 3], + as[ 2], + as[ 1], + as[ 0])) {} + static intvec iota() + { + return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi16(vlo, xvlo); + vhi = _mm_add_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi16(vlo, xvlo); + vhi = _mm_sub_epi16(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srli_epi16(vlo, n); + vhi = _mm_srli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_srai_epi16(vlo, n); + vhi = _mm_srai_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + vlo = _mm_slli_epi16(vlo, n); + vhi = _mm_slli_epi16(vhi, n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<<n; } + + intvec lsr(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, U(r[i]) >> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, r[i] >> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, r[i] << n[i]); + } + return r; + } + intvec& operator>>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<<n; } + + + + boolvec_t operator==(intvec const& x) const + { + return ! (*this != x); + } + boolvec_t operator!=(intvec const& x) const + { + return (*this ^ x).convert_bool(); + } + }; + + + + template<> + struct realvec<fp16,16>: floatprops<fp16> + { + static int const size = 16; + typedef real_t scalar_t; + typedef __m256i vector_t; + + static char const* name() { return "<AVX:16*fp16>"; } + inline void barrier() { asm("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {} + realvec(real_t const* as): + v(_mm256_set_epi16(FP::as_int(as[15]), + FP::as_int(as[14]), + FP::as_int(as[13]), + FP::as_int(as[12]), + FP::as_int(as[11]), + FP::as_int(as[10]), + FP::as_int(as[ 9]), + FP::as_int(as[ 8]), + FP::as_int(as[ 7]), + FP::as_int(as[ 6]), + FP::as_int(as[ 5]), + FP::as_int(as[ 4]), + FP::as_int(as[ 3]), + FP::as_int(as[ 2]), + FP::as_int(as[ 1]), + FP::as_int(as[ 0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm256_load_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm256_store_si256((__m256i*)p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_si256((__m256i*)p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { __builtin_unreachable(); } + + + + realvec operator+() const { __builtin_unreachable(); } + realvec operator-() const { __builtin_unreachable(); } + + realvec operator+(realvec x) const { __builtin_unreachable(); } + realvec operator-(realvec x) const { __builtin_unreachable(); } + realvec operator*(realvec x) const { __builtin_unreachable(); } + realvec operator/(realvec x) const { __builtin_unreachable(); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const { __builtin_unreachable(); } + real_t sum() const { __builtin_unreachable(); } + + + + boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } + + + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } + }; + + + + // boolvec definitions + + inline + auto boolvec<fp16,16>::as_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec<fp16,16>::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); + } + + + + // intvec definitions + + inline auto intvec<fp16,16>::as_float() const -> realvec_t + { + return v; + } + + inline auto intvec<fp16,16>::convert_float() const -> realvec_t + { + __builtin_unreachable(); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_FP16_AVX_H diff --git a/vec_fp8_avx.h b/vec_fp8_avx.h new file mode 100644 index 0000000..b612f42 --- /dev/null +++ b/vec_fp8_avx.h @@ -0,0 +1,645 @@ +// -*-C++-*- + +#ifndef VEC_FP8_AVX_H +#define VEC_FP8_AVX_H + +#include "floatprops.h" +#include "mathfuncs.h" +#include "vec_base.h" + +#include <cmath> + +// AVX intrinsics +#include <immintrin.h> + + + +namespace vecmathlib { + +#define VECMATHLIB_HAVE_VEC_FP8_32 + template<> struct boolvec<fp8,32>; + template<> struct intvec<fp8,32>; + template<> struct realvec<fp8,32>; + + + + template<> + struct boolvec<fp8,32>: floatprops<fp8> + { + static int const size = 32; + typedef bool scalar_t; + typedef __m256i bvector_t; + + static_assert(size * sizeof(real_t) == sizeof(bvector_t), + "vector size is wrong"); + + private: + // true values have the sign bit set, false values have it unset + static uint_t from_bool(bool a) { return - uint_t(a); } + static bool to_bool(uint_t a) { return int_t(a) < int_t(0); } + public: + + typedef boolvec boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + bvector_t v; + + boolvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // boolvec(boolvec const& x): v(x.v) {} + // boolvec& operator=(boolvec const& x) { return v=x.v, *this; } + boolvec(bvector_t x): v(x) {} + boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {} + boolvec(bool const* as): + v(_mm256_set_epi8(from_bool(as[31]), + from_bool(as[30]), + from_bool(as[29]), + from_bool(as[28]), + from_bool(as[27]), + from_bool(as[26]), + from_bool(as[25]), + from_bool(as[24]), + from_bool(as[23]), + from_bool(as[22]), + from_bool(as[21]), + from_bool(as[20]), + from_bool(as[19]), + from_bool(as[18]), + from_bool(as[17]), + from_bool(as[16]), + from_bool(as[15]), + from_bool(as[14]), + from_bool(as[13]), + from_bool(as[12]), + from_bool(as[11]), + from_bool(as[10]), + from_bool(as[ 9]), + from_bool(as[ 8]), + from_bool(as[ 7]), + from_bool(as[ 6]), + from_bool(as[ 5]), + from_bool(as[ 4]), + from_bool(as[ 3]), + from_bool(as[ 2]), + from_bool(as[ 1]), + from_bool(as[ 0]))) {} + + operator bvector_t() const { return v; } + bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); } + boolvec& set_elt(int n, bool a) + { + return ((uint_t*)&v)[n]=from_bool(a), *this; + } + + + + intvec_t as_int() const; // defined after intvec + intvec_t convert_int() const; // defined after intvec + + + + boolvec operator!() const { return *this != boolvec(true); } + + boolvec operator&&(boolvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator||(boolvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + boolvec operator==(boolvec x) const { return !(*this==x); } + boolvec operator!=(boolvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + bool all() const + { + bool r = true; + for (int n=0; n<size; ++n) r = r && (*this)[n]; + return r; + } + bool any() const + { + bool r = false; + for (int n=0; n<size; ++n) r = r || (*this)[n]; + return r; + } + + + + // ifthen(condition, then-value, else-value) + intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec + realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec + }; + + + + template<> + struct intvec<fp8,32>: floatprops<fp8> + { + static int const size = 32; + typedef int_t scalar_t; + typedef __m256i ivector_t; + + static_assert(size * sizeof(real_t) == sizeof(ivector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec intvec_t; + typedef realvec<real_t, size> realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + ivector_t v; + + intvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // intvec(intvec const& x): v(x.v) {} + // intvec& operator=(intvec const& x) { return v=x.v, *this; } + intvec(ivector_t x): v(x) {} + intvec(int_t a): v(_mm256_set1_epi8(a)) {} + intvec(int_t const* as): + v(_mm256_set_epi8(as[31], + as[30], + as[29], + as[28], + as[27], + as[26], + as[25], + as[24], + as[23], + as[22], + as[21], + as[20], + as[19], + as[18], + as[17], + as[16], + as[15], + as[14], + as[13], + as[12], + as[11], + as[10], + as[ 9], + as[ 8], + as[ 7], + as[ 6], + as[ 5], + as[ 4], + as[ 3], + as[ 2], + as[ 1], + as[ 0])) {} + static intvec iota() + { + return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0); + } + + operator ivector_t() const { return v; } + int_t operator[](int n) const { return ((int_t const*)&v)[n]; } + intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; } + + + + boolvec_t as_bool() const { return v; } + boolvec_t convert_bool() const + { + // Result: convert_bool(0)=false, convert_bool(else)=true + // There is no intrinsic to compare with zero. Instead, we check + // whether x is positive and x-1 is negative. + intvec x = *this; + // We know that boolvec values depend only on the sign bit + // return (~(x-1) | x).as_bool(); + // return x.as_bool() || !(x-1).as_bool(); + return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); + } + realvec_t as_float() const; // defined after realvec + realvec_t convert_float() const; // defined after realvec + + + + // Note: not all arithmetic operations are supported! + + intvec operator+() const { return *this; } + intvec operator-() const { return IV(I(0)) - *this; } + + intvec operator+(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_add_epi8(vlo, xvlo); + vhi = _mm_add_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator-(intvec x) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + __m128i xvlo = _mm256_castsi256_si128(x.v); + __m128i xvhi = _mm256_extractf128_si256(x.v, 1); + vlo = _mm_sub_epi8(vlo, xvlo); + vhi = _mm_sub_epi8(vhi, xvhi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + + intvec& operator+=(intvec const& x) { return *this=*this+x; } + intvec& operator-=(intvec const& x) { return *this=*this-x; } + + + + intvec operator~() const { return IV(~U(0)) ^ *this; } + + intvec operator&(intvec x) const + { + return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator|(intvec x) const + { + return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + intvec operator^(intvec x) const + { + return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), + _mm256_castsi256_ps(x.v))); + } + + intvec& operator&=(intvec const& x) { return *this=*this&x; } + intvec& operator|=(intvec const& x) { return *this=*this|x; } + intvec& operator^=(intvec const& x) { return *this=*this^x; } + + + + intvec lsr(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU) >> U(n); + uint_t maskhi = U(0xff00U); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator>>(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U); + __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8), + _mm_set1_epi16(masklo)); + __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n), + _mm_set1_epi16(maskhi)); + vlo = _mm_or_si128(vlolo, vlohi); + __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8), + _mm_set1_epi16(masklo)); + __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n), + _mm_set1_epi16(maskhi)); + vhi = _mm_or_si128(vhilo, vhihi); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec operator<<(int_t n) const + { + __m128i vlo = _mm256_castsi256_si128(v); + __m128i vhi = _mm256_extractf128_si256(v, 1); + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U) << U(n); + __m128i mask = _mm_set1_epi16(masklo | maskhi); + vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); + vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); + return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); + } + intvec& operator>>=(int_t n) { return *this=*this>>n; } + intvec& operator<<=(int_t n) { return *this=*this<<n; } + + intvec lsr(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, U(r[i]) >> U(n[i])); + } + return r; + } + intvec operator>>(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, r[i] >> n[i]); + } + return r; + } + intvec operator<<(intvec n) const + { + intvec r = *this; + for (int i=0; i<size; ++i) { + r.set_elt(i, r[i] << n[i]); + } + return r; + } + intvec& operator>>=(intvec n) { return *this=*this>>n; } + intvec& operator<<=(intvec n) { return *this=*this<<n; } + + + + boolvec_t operator==(intvec const& x) const + { + return ! (*this != x); + } + boolvec_t operator!=(intvec const& x) const + { + return (*this ^ x).convert_bool(); + } + }; + + + + template<> + struct realvec<fp8,32>: floatprops<fp8> + { + static int const size = 32; + typedef real_t scalar_t; + typedef __m256i vector_t; + + static char const* name() { return "<AVX:32*fp8>"; } + inline void barrier() { asm("": "+x" (v)); } + + static_assert(size * sizeof(real_t) == sizeof(vector_t), + "vector size is wrong"); + + typedef boolvec<real_t, size> boolvec_t; + typedef intvec<real_t, size> intvec_t; + typedef realvec realvec_t; + + // Short names for type casts + typedef real_t R; + typedef int_t I; + typedef uint_t U; + typedef realvec_t RV; + typedef intvec_t IV; + typedef boolvec_t BV; + typedef floatprops<real_t> FP; + typedef mathfuncs<realvec_t> MF; + + + + vector_t v; + + realvec() {} + // Can't have a non-trivial copy constructor; if so, objects won't + // be passed in registers + // realvec(realvec const& x): v(x.v) {} + // realvec& operator=(realvec const& x) { return v=x.v, *this; } + realvec(vector_t x): v(x) {} + realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {} + realvec(real_t const* as): + v(_mm256_set_epi8(FP::as_int(as[31]), + FP::as_int(as[30]), + FP::as_int(as[29]), + FP::as_int(as[28]), + FP::as_int(as[27]), + FP::as_int(as[26]), + FP::as_int(as[25]), + FP::as_int(as[24]), + FP::as_int(as[23]), + FP::as_int(as[22]), + FP::as_int(as[21]), + FP::as_int(as[20]), + FP::as_int(as[19]), + FP::as_int(as[18]), + FP::as_int(as[17]), + FP::as_int(as[16]), + FP::as_int(as[15]), + FP::as_int(as[14]), + FP::as_int(as[13]), + FP::as_int(as[12]), + FP::as_int(as[11]), + FP::as_int(as[10]), + FP::as_int(as[ 9]), + FP::as_int(as[ 8]), + FP::as_int(as[ 7]), + FP::as_int(as[ 6]), + FP::as_int(as[ 5]), + FP::as_int(as[ 4]), + FP::as_int(as[ 3]), + FP::as_int(as[ 2]), + FP::as_int(as[ 1]), + FP::as_int(as[ 0]))) {} + + operator vector_t() const { return v; } + real_t operator[](int n) const { return ((real_t const*)&v)[n]; } + realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; } + + + + typedef vecmathlib::mask_t<realvec_t> mask_t; + + static realvec_t loada(real_t const* p) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + return _mm256_load_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p) + { + return _mm256_loadu_si256((__m256i const*)p); + } + static realvec_t loadu(real_t const* p, size_t ioff) + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p); + return loadu(p+ioff); + } + realvec_t loada(real_t const* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(all(m.m), true)) { + return loada(p); + } else { + return m.m.ifthen(loada(p), *this); + } + } + realvec_t loadu(real_t const* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + return loadu(p); + } else { + return m.m.ifthen(loadu(p), *this); + } + } + realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return loada(p, m); + return loadu(p+ioff, m); + } + + void storea(real_t* p) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + _mm256_store_si256((__m256i*)p, v); + } + void storeu(real_t* p) const + { + return _mm256_storeu_si256((__m256i*)p, v); + } + void storeu(real_t* p, size_t ioff) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p); + storeu(p+ioff); + } + void storea(real_t* p, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (__builtin_expect(m.all_m, true)) { + storea(p); + } else { + // TODO: this is expensive + for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; + } + } + void storeu(real_t* p, mask_t const& m) const + { + if (__builtin_expect(m.all_m, true)) { + storeu(p); + } else { + // TODO: this is expensive + for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n]; + } + } + void storeu(real_t* p, size_t ioff, mask_t const& m) const + { + VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0); + if (ioff==0) return storea(p, m); + storeu(p+ioff, m); + } + + + + intvec_t as_int() const { return v; } + intvec_t convert_int() const { __builtin_unreachable(); } + + + + realvec operator+() const { __builtin_unreachable(); } + realvec operator-() const { __builtin_unreachable(); } + + realvec operator+(realvec x) const { __builtin_unreachable(); } + realvec operator-(realvec x) const { __builtin_unreachable(); } + realvec operator*(realvec x) const { __builtin_unreachable(); } + realvec operator/(realvec x) const { __builtin_unreachable(); } + + realvec& operator+=(realvec const& x) { return *this=*this+x; } + realvec& operator-=(realvec const& x) { return *this=*this-x; } + realvec& operator*=(realvec const& x) { return *this=*this*x; } + realvec& operator/=(realvec const& x) { return *this=*this/x; } + + real_t prod() const { __builtin_unreachable(); } + real_t sum() const { __builtin_unreachable(); } + + + + boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); } + + + + realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); } + realvec fabs() const { return MF::vml_fabs(*this); } + intvec_t ilogb() const { return MF::vml_ilogb(*this); } + boolvec_t isfinite() const { return MF::vml_isfinite(*this); } + boolvec_t isinf() const { return MF::vml_isinf(*this); } + boolvec_t isnan() const { return MF::vml_isnan(*this); } + boolvec_t isnormal() const { return MF::vml_isnormal(*this); } + realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); } + realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); } + boolvec_t signbit() const { return v; } + }; + + + + // boolvec definitions + + inline + auto boolvec<fp8,32>::as_int() const -> intvec_t + { + return v; + } + + inline + auto boolvec<fp8,32>::convert_int() const -> intvec_t + { + return lsr(as_int(), bits-1); + } + + inline + auto boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const -> intvec_t + { + return ifthen(x.as_float(), y.as_float()).as_int(); + } + + inline + auto boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const -> realvec_t + { + return (( -convert_int() & x.as_int()) | + (~-convert_int() & y.as_int())).as_float(); + } + + + + // intvec definitions + + inline auto intvec<fp8,32>::as_float() const -> realvec_t + { + return v; + } + + inline auto intvec<fp8,32>::convert_float() const -> realvec_t + { + __builtin_unreachable(); + } + +} // namespace vecmathlib + +#endif // #ifndef VEC_FP8_AVX_H diff --git a/vecmathlib.h b/vecmathlib.h index 451a967..abfeda3 100644 --- a/vecmathlib.h +++ b/vecmathlib.h @@ -37,20 +37,16 @@ // Intel SSE 2 #if defined __SSE2__ # include "vec_float_sse2_scalar.h" -# include "vec_float_sse2.h" -#endif -// Intel AVX -#if defined __AVX__ -# include "vec_float_avx.h" -#endif - -// Intel SSE 2 -#if defined __SSE2__ # include "vec_double_sse2_scalar.h" +# include "vec_float_sse2.h" # include "vec_double_sse2.h" #endif + // Intel AVX #if defined __AVX__ +# include "vec_fp8_avx.h" +# include "vec_fp16_avx.h" +# include "vec_float_avx.h" # include "vec_double_avx.h" #endif |