summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--floatprops.h140
-rw-r--r--floattypes.h22
-rw-r--r--mathfuncs_fabs.h4
-rw-r--r--vec_fp16_avx.h579
-rw-r--r--vec_fp8_avx.h645
-rw-r--r--vecmathlib.h14
6 files changed, 1373 insertions, 31 deletions
diff --git a/floatprops.h b/floatprops.h
index 8d4ddcf..c39c788 100644
--- a/floatprops.h
+++ b/floatprops.h
@@ -3,6 +3,8 @@
#ifndef FLOATPROPS_H
#define FLOATPROPS_H
+#include "floattypes.h"
+
#include <cmath>
#include <cstdint>
#include <cstring>
@@ -27,8 +29,116 @@ namespace vecmathlib {
// max_exponent
};
- template<typename int_t>
- struct intprops {
+
+
+ // Properties of fp8
+ template<>
+ struct floatprops<fp8> {
+ typedef fp8 real_t;
+ typedef int8_t int_t;
+ typedef uint8_t uint_t;
+
+ // Definitions that might come from numeric_limits<> instead:
+ static int const digits = 4;
+ static int epsilon() { __builtin_unreachable(); }
+ static int const min_exponent = -6;
+ static int const max_exponent = 7;
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits-1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x)
+ {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x)
+ {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+
+ // Convert values
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
+ };
+
+
+
+ // Properties of fp16
+ template<>
+ struct floatprops<fp16> {
+ typedef fp16 real_t;
+ typedef int16_t int_t;
+ typedef uint16_t uint_t;
+
+ // Definitions that might come from numeric_limits<> instead:
+ static int const digits = 11;
+ static int epsilon() { __builtin_unreachable(); }
+ static int const min_exponent = -14;
+ static int const max_exponent = 15;
+
+ // Ensure the sizes match
+ static_assert(sizeof(real_t) == sizeof(int_t), "int_t has wrong size");
+ static_assert(sizeof(real_t) == sizeof(uint_t), "uint_t has wrong size");
+
+ // Number of bits in internal representation
+ static int const bits = 8 * sizeof(real_t);
+ static int const mantissa_bits = digits - 1;
+ static int const signbit_bits = 1;
+ static int const exponent_bits = bits - mantissa_bits - signbit_bits;
+ static int const exponent_offset = 2 - min_exponent;
+ static_assert(mantissa_bits + exponent_bits + signbit_bits == bits,
+ "error in bit counts");
+ static uint_t const mantissa_mask = (uint_t(1) << mantissa_bits) - 1;
+ static uint_t const exponent_mask =
+ ((uint_t(1) << exponent_bits) - 1) << mantissa_bits;
+ static uint_t const signbit_mask = uint_t(1) << (bits-1);
+ static_assert((mantissa_mask & exponent_mask & signbit_mask) == uint_t(0),
+ "error in masks");
+ static_assert((mantissa_mask | exponent_mask | signbit_mask) ==
+ uint_t(~uint_t(0)),
+ "error in masks");
+
+ // Re-interpret bit patterns
+ static real_t as_float(int_t x)
+ {
+ real_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+ static int_t as_int(real_t x)
+ {
+ int_t res;
+ std::memcpy(&res, &x, sizeof res);
+ return res;
+ }
+
+ // Convert values
+ static real_t convert_float(int_t x) { __builtin_unreachable(); }
+ static int_t convert_int(real_t x) { __builtin_unreachable(); }
};
@@ -66,7 +176,7 @@ namespace vecmathlib {
"error in masks");
// Re-interpret bit patterns
- static inline real_t as_float(int_t x)
+ static real_t as_float(int_t x)
{
// return *(real_t*)&x;
// union { int_t i; real_t r; } ir;
@@ -75,7 +185,7 @@ namespace vecmathlib {
std::memcpy(&res, &x, sizeof res);
return res;
}
- static inline int_t as_int(real_t x)
+ static int_t as_int(real_t x)
{
// return *(int_t*)&x;
// union { real_t r; int_t i; } ri;
@@ -86,8 +196,8 @@ namespace vecmathlib {
}
// Convert values
- static inline real_t convert_float(int_t x) { return real_t(x); }
- static inline int_t convert_int(real_t x)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x)
{
static_assert(sizeof std::lrint(x) >= sizeof(int_t),
"lrint() has wrong return type");
@@ -103,11 +213,6 @@ namespace vecmathlib {
}
};
- template<>
- struct intprops<floatprops<float>::int_t> {
- typedef float real_t;
- };
-
// Properties of double
@@ -143,7 +248,7 @@ namespace vecmathlib {
"error in masks");
// Re-interpret bit patterns
- static inline real_t as_float(int_t x)
+ static real_t as_float(int_t x)
{
// return *(real_t*)&x;
// union { int_t i; real_t r; } ir;
@@ -152,7 +257,7 @@ namespace vecmathlib {
std::memcpy(&res, &x, sizeof res);
return res;
}
- static inline int_t as_int(real_t x)
+ static int_t as_int(real_t x)
{
// return *(int_t*)&x;
// union { real_t r; int_t i; } ri;
@@ -163,8 +268,8 @@ namespace vecmathlib {
}
// Convert values
- static inline real_t convert_float(int_t x) { return real_t(x); }
- static inline int_t convert_int(real_t x)
+ static real_t convert_float(int_t x) { return real_t(x); }
+ static int_t convert_int(real_t x)
{
static_assert(sizeof std::lrint(x) >= sizeof(int_t),
"lrint() has wrong return type");
@@ -180,11 +285,6 @@ namespace vecmathlib {
}
};
- template<>
- struct intprops<floatprops<double>::int_t> {
- typedef double real_t;
- };
-
} // namespace vecmathlib
diff --git a/floattypes.h b/floattypes.h
new file mode 100644
index 0000000..008f695
--- /dev/null
+++ b/floattypes.h
@@ -0,0 +1,22 @@
+// -*-C++-*-
+
+#ifndef FLOATTYPESS_H
+#define FLOATTYPESS_H
+
+#include <cstdint>
+
+namespace vecmathlib {
+
+ struct fp8 {
+ // 1 bit sign, 4 bits exponent, 3 bits mantissa
+ std::uint8_t val;
+ };
+
+ struct fp16 {
+ // 1 bit sign, 5 bits exponent, 10 bits mantissa
+ std::uint16_t val;
+ };
+
+} // namespace vecmathlib
+
+#endif // #ifndef FLOATTYPES_H
diff --git a/mathfuncs_fabs.h b/mathfuncs_fabs.h
index 1050147..952dbef 100644
--- a/mathfuncs_fabs.h
+++ b/mathfuncs_fabs.h
@@ -14,7 +14,7 @@ namespace vecmathlib {
template<typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_copysign(realvec_t x, realvec_t y)
{
- intvec_t value = as_int(x) & IV(~FP::signbit_mask);
+ intvec_t value = as_int(x) & IV(U(~FP::signbit_mask));
intvec_t sign = as_int(y) & IV(FP::signbit_mask);
return as_float(sign | value);
}
@@ -22,7 +22,7 @@ namespace vecmathlib {
template<typename realvec_t>
realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x)
{
- return as_float(as_int(x) & IV(~FP::signbit_mask));
+ return as_float(as_int(x) & IV(U(~FP::signbit_mask)));
}
template<typename realvec_t>
diff --git a/vec_fp16_avx.h b/vec_fp16_avx.h
new file mode 100644
index 0000000..b52d67c
--- /dev/null
+++ b/vec_fp16_avx.h
@@ -0,0 +1,579 @@
+// -*-C++-*-
+
+#ifndef VEC_FP16_AVX_H
+#define VEC_FP16_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+
+#define VECMATHLIB_HAVE_VEC_FP16_16
+ template<> struct boolvec<fp16,16>;
+ template<> struct intvec<fp16,16>;
+ template<> struct realvec<fp16,16>;
+
+
+
+ template<>
+ struct boolvec<fp16,16>: floatprops<fp16>
+ {
+ static int const size = 16;
+ typedef bool scalar_t;
+ typedef __m256i bvector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+ private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return - uint_t(a); }
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+ public:
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x): v(x) {}
+ boolvec(bool a): v(_mm256_set1_epi16(from_bool(a))) {}
+ boolvec(bool const* as):
+ v(_mm256_set_epi16(from_bool(as[15]),
+ from_bool(as[14]),
+ from_bool(as[13]),
+ from_bool(as[12]),
+ from_bool(as[11]),
+ from_bool(as[10]),
+ from_bool(as[ 9]),
+ from_bool(as[ 8]),
+ from_bool(as[ 7]),
+ from_bool(as[ 6]),
+ from_bool(as[ 5]),
+ from_bool(as[ 4]),
+ from_bool(as[ 3]),
+ from_bool(as[ 2]),
+ from_bool(as[ 1]),
+ from_bool(as[ 0]))) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+ boolvec& set_elt(int n, bool a)
+ {
+ return ((uint_t*)&v)[n]=from_bool(a), *this;
+ }
+
+
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+
+
+ boolvec operator!() const { return *this != boolvec(true); }
+
+ boolvec operator&&(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator||(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator==(boolvec x) const { return !(*this==x); }
+ boolvec operator!=(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+
+ bool all() const
+ {
+ bool r = true;
+ for (int n=0; n<size; ++n) r = r && (*this)[n];
+ return r;
+ }
+ bool any() const
+ {
+ bool r = false;
+ for (int n=0; n<size; ++n) r = r || (*this)[n];
+ return r;
+ }
+
+
+
+ // ifthen(condition, then-value, else-value)
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+ };
+
+
+
+ template<>
+ struct intvec<fp16,16>: floatprops<fp16>
+ {
+ static int const size = 16;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x): v(x) {}
+ intvec(int_t a): v(_mm256_set1_epi16(a)) {}
+ intvec(int_t const* as):
+ v(_mm256_set_epi16(as[15],
+ as[14],
+ as[13],
+ as[12],
+ as[11],
+ as[10],
+ as[ 9],
+ as[ 8],
+ as[ 7],
+ as[ 6],
+ as[ 5],
+ as[ 4],
+ as[ 3],
+ as[ 2],
+ as[ 1],
+ as[ 0])) {}
+ static intvec iota()
+ {
+ return _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0);
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+ intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+
+
+
+ boolvec_t as_bool() const { return v; }
+ boolvec_t convert_bool() const
+ {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare with zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec x = *this;
+ // We know that boolvec values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return IV(I(0)) - *this; }
+
+ intvec operator+(intvec x) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi16(vlo, xvlo);
+ vhi = _mm_add_epi16(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator-(intvec x) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi16(vlo, xvlo);
+ vhi = _mm_sub_epi16(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+
+ intvec& operator+=(intvec const& x) { return *this=*this+x; }
+ intvec& operator-=(intvec const& x) { return *this=*this-x; }
+
+
+
+ intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec operator&(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ intvec operator|(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ intvec operator^(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+
+ intvec& operator&=(intvec const& x) { return *this=*this&x; }
+ intvec& operator|=(intvec const& x) { return *this=*this|x; }
+ intvec& operator^=(intvec const& x) { return *this=*this^x; }
+
+
+
+ intvec lsr(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srli_epi16(vlo, n);
+ vhi = _mm_srli_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator>>(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_srai_epi16(vlo, n);
+ vhi = _mm_srai_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator<<(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ vlo = _mm_slli_epi16(vlo, n);
+ vhi = _mm_slli_epi16(vhi, n);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec& operator>>=(int_t n) { return *this=*this>>n; }
+ intvec& operator<<=(int_t n) { return *this=*this<<n; }
+
+ intvec lsr(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, U(r[i]) >> U(n[i]));
+ }
+ return r;
+ }
+ intvec operator>>(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, r[i] >> n[i]);
+ }
+ return r;
+ }
+ intvec operator<<(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, r[i] << n[i]);
+ }
+ return r;
+ }
+ intvec& operator>>=(intvec n) { return *this=*this>>n; }
+ intvec& operator<<=(intvec n) { return *this=*this<<n; }
+
+
+
+ boolvec_t operator==(intvec const& x) const
+ {
+ return ! (*this != x);
+ }
+ boolvec_t operator!=(intvec const& x) const
+ {
+ return (*this ^ x).convert_bool();
+ }
+ };
+
+
+
+ template<>
+ struct realvec<fp16,16>: floatprops<fp16>
+ {
+ static int const size = 16;
+ typedef real_t scalar_t;
+ typedef __m256i vector_t;
+
+ static char const* name() { return "<AVX:16*fp16>"; }
+ inline void barrier() { asm("": "+x" (v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x): v(x) {}
+ realvec(real_t a): v(_mm256_set1_epi16(FP::as_int(a))) {}
+ realvec(real_t const* as):
+ v(_mm256_set_epi16(FP::as_int(as[15]),
+ FP::as_int(as[14]),
+ FP::as_int(as[13]),
+ FP::as_int(as[12]),
+ FP::as_int(as[11]),
+ FP::as_int(as[10]),
+ FP::as_int(as[ 9]),
+ FP::as_int(as[ 8]),
+ FP::as_int(as[ 7]),
+ FP::as_int(as[ 6]),
+ FP::as_int(as[ 5]),
+ FP::as_int(as[ 4]),
+ FP::as_int(as[ 3]),
+ FP::as_int(as[ 2]),
+ FP::as_int(as[ 1]),
+ FP::as_int(as[ 0]))) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+
+
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm256_load_si256((__m256i const*)p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm256_loadu_si256((__m256i const*)p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm256_store_si256((__m256i*)p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm256_storeu_si256((__m256i*)p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // TODO: this is expensive
+ for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
+ intvec_t as_int() const { return v; }
+ intvec_t convert_int() const { __builtin_unreachable(); }
+
+
+
+ realvec operator+() const { __builtin_unreachable(); }
+ realvec operator-() const { __builtin_unreachable(); }
+
+ realvec operator+(realvec x) const { __builtin_unreachable(); }
+ realvec operator-(realvec x) const { __builtin_unreachable(); }
+ realvec operator*(realvec x) const { __builtin_unreachable(); }
+ realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+ realvec& operator+=(realvec const& x) { return *this=*this+x; }
+ realvec& operator-=(realvec const& x) { return *this=*this-x; }
+ realvec& operator*=(realvec const& x) { return *this=*this*x; }
+ realvec& operator/=(realvec const& x) { return *this=*this/x; }
+
+ real_t prod() const { __builtin_unreachable(); }
+ real_t sum() const { __builtin_unreachable(); }
+
+
+
+ boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+
+
+
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ boolvec_t signbit() const { return v; }
+ };
+
+
+
+ // boolvec definitions
+
+ inline
+ auto boolvec<fp16,16>::as_int() const -> intvec_t
+ {
+ return v;
+ }
+
+ inline
+ auto boolvec<fp16,16>::convert_int() const -> intvec_t
+ {
+ return lsr(as_int(), bits-1);
+ }
+
+ inline
+ auto boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+ {
+ return ifthen(x.as_float(), y.as_float()).as_int();
+ }
+
+ inline
+ auto boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+ {
+ return (( -convert_int() & x.as_int()) |
+ (~-convert_int() & y.as_int())).as_float();
+ }
+
+
+
+ // intvec definitions
+
+ inline auto intvec<fp16,16>::as_float() const -> realvec_t
+ {
+ return v;
+ }
+
+ inline auto intvec<fp16,16>::convert_float() const -> realvec_t
+ {
+ __builtin_unreachable();
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_FP16_AVX_H
diff --git a/vec_fp8_avx.h b/vec_fp8_avx.h
new file mode 100644
index 0000000..b612f42
--- /dev/null
+++ b/vec_fp8_avx.h
@@ -0,0 +1,645 @@
+// -*-C++-*-
+
+#ifndef VEC_FP8_AVX_H
+#define VEC_FP8_AVX_H
+
+#include "floatprops.h"
+#include "mathfuncs.h"
+#include "vec_base.h"
+
+#include <cmath>
+
+// AVX intrinsics
+#include <immintrin.h>
+
+
+
+namespace vecmathlib {
+
+#define VECMATHLIB_HAVE_VEC_FP8_32
+ template<> struct boolvec<fp8,32>;
+ template<> struct intvec<fp8,32>;
+ template<> struct realvec<fp8,32>;
+
+
+
+ template<>
+ struct boolvec<fp8,32>: floatprops<fp8>
+ {
+ static int const size = 32;
+ typedef bool scalar_t;
+ typedef __m256i bvector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(bvector_t),
+ "vector size is wrong");
+
+ private:
+ // true values have the sign bit set, false values have it unset
+ static uint_t from_bool(bool a) { return - uint_t(a); }
+ static bool to_bool(uint_t a) { return int_t(a) < int_t(0); }
+ public:
+
+ typedef boolvec boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ bvector_t v;
+
+ boolvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // boolvec(boolvec const& x): v(x.v) {}
+ // boolvec& operator=(boolvec const& x) { return v=x.v, *this; }
+ boolvec(bvector_t x): v(x) {}
+ boolvec(bool a): v(_mm256_set1_epi8(from_bool(a))) {}
+ boolvec(bool const* as):
+ v(_mm256_set_epi8(from_bool(as[31]),
+ from_bool(as[30]),
+ from_bool(as[29]),
+ from_bool(as[28]),
+ from_bool(as[27]),
+ from_bool(as[26]),
+ from_bool(as[25]),
+ from_bool(as[24]),
+ from_bool(as[23]),
+ from_bool(as[22]),
+ from_bool(as[21]),
+ from_bool(as[20]),
+ from_bool(as[19]),
+ from_bool(as[18]),
+ from_bool(as[17]),
+ from_bool(as[16]),
+ from_bool(as[15]),
+ from_bool(as[14]),
+ from_bool(as[13]),
+ from_bool(as[12]),
+ from_bool(as[11]),
+ from_bool(as[10]),
+ from_bool(as[ 9]),
+ from_bool(as[ 8]),
+ from_bool(as[ 7]),
+ from_bool(as[ 6]),
+ from_bool(as[ 5]),
+ from_bool(as[ 4]),
+ from_bool(as[ 3]),
+ from_bool(as[ 2]),
+ from_bool(as[ 1]),
+ from_bool(as[ 0]))) {}
+
+ operator bvector_t() const { return v; }
+ bool operator[](int n) const { return to_bool(((uint_t const*)&v)[n]); }
+ boolvec& set_elt(int n, bool a)
+ {
+ return ((uint_t*)&v)[n]=from_bool(a), *this;
+ }
+
+
+
+ intvec_t as_int() const; // defined after intvec
+ intvec_t convert_int() const; // defined after intvec
+
+
+
+ boolvec operator!() const { return *this != boolvec(true); }
+
+ boolvec operator&&(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator||(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ boolvec operator==(boolvec x) const { return !(*this==x); }
+ boolvec operator!=(boolvec x) const
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+
+ bool all() const
+ {
+ bool r = true;
+ for (int n=0; n<size; ++n) r = r && (*this)[n];
+ return r;
+ }
+ bool any() const
+ {
+ bool r = false;
+ for (int n=0; n<size; ++n) r = r || (*this)[n];
+ return r;
+ }
+
+
+
+ // ifthen(condition, then-value, else-value)
+ intvec_t ifthen(intvec_t x, intvec_t y) const; // defined after intvec
+ realvec_t ifthen(realvec_t x, realvec_t y) const; // defined after realvec
+ };
+
+
+
+ template<>
+ struct intvec<fp8,32>: floatprops<fp8>
+ {
+ static int const size = 32;
+ typedef int_t scalar_t;
+ typedef __m256i ivector_t;
+
+ static_assert(size * sizeof(real_t) == sizeof(ivector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec intvec_t;
+ typedef realvec<real_t, size> realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ ivector_t v;
+
+ intvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // intvec(intvec const& x): v(x.v) {}
+ // intvec& operator=(intvec const& x) { return v=x.v, *this; }
+ intvec(ivector_t x): v(x) {}
+ intvec(int_t a): v(_mm256_set1_epi8(a)) {}
+ intvec(int_t const* as):
+ v(_mm256_set_epi8(as[31],
+ as[30],
+ as[29],
+ as[28],
+ as[27],
+ as[26],
+ as[25],
+ as[24],
+ as[23],
+ as[22],
+ as[21],
+ as[20],
+ as[19],
+ as[18],
+ as[17],
+ as[16],
+ as[15],
+ as[14],
+ as[13],
+ as[12],
+ as[11],
+ as[10],
+ as[ 9],
+ as[ 8],
+ as[ 7],
+ as[ 6],
+ as[ 5],
+ as[ 4],
+ as[ 3],
+ as[ 2],
+ as[ 1],
+ as[ 0])) {}
+ static intvec iota()
+ {
+ return _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0);
+ }
+
+ operator ivector_t() const { return v; }
+ int_t operator[](int n) const { return ((int_t const*)&v)[n]; }
+ intvec& set_elt(int n, int_t a) { return ((int_t*)&v)[n]=a, *this; }
+
+
+
+ boolvec_t as_bool() const { return v; }
+ boolvec_t convert_bool() const
+ {
+ // Result: convert_bool(0)=false, convert_bool(else)=true
+ // There is no intrinsic to compare with zero. Instead, we check
+ // whether x is positive and x-1 is negative.
+ intvec x = *this;
+ // We know that boolvec values depend only on the sign bit
+ // return (~(x-1) | x).as_bool();
+ // return x.as_bool() || !(x-1).as_bool();
+ return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool();
+ }
+ realvec_t as_float() const; // defined after realvec
+ realvec_t convert_float() const; // defined after realvec
+
+
+
+ // Note: not all arithmetic operations are supported!
+
+ intvec operator+() const { return *this; }
+ intvec operator-() const { return IV(I(0)) - *this; }
+
+ intvec operator+(intvec x) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_add_epi8(vlo, xvlo);
+ vhi = _mm_add_epi8(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator-(intvec x) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ __m128i xvlo = _mm256_castsi256_si128(x.v);
+ __m128i xvhi = _mm256_extractf128_si256(x.v, 1);
+ vlo = _mm_sub_epi8(vlo, xvlo);
+ vhi = _mm_sub_epi8(vhi, xvhi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+
+ intvec& operator+=(intvec const& x) { return *this=*this+x; }
+ intvec& operator-=(intvec const& x) { return *this=*this-x; }
+
+
+
+ intvec operator~() const { return IV(~U(0)) ^ *this; }
+
+ intvec operator&(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ intvec operator|(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+ intvec operator^(intvec x) const
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
+ _mm256_castsi256_ps(x.v)));
+ }
+
+ intvec& operator&=(intvec const& x) { return *this=*this&x; }
+ intvec& operator|=(intvec const& x) { return *this=*this|x; }
+ intvec& operator^=(intvec const& x) { return *this=*this^x; }
+
+
+
+ intvec lsr(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU) >> U(n);
+ uint_t maskhi = U(0xff00U);
+ __m128i mask = _mm_set1_epi16(masklo | maskhi);
+ vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
+ vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator>>(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU);
+ uint_t maskhi = U(0xff00U);
+ __m128i vlolo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vlo, 8), n+8),
+ _mm_set1_epi16(masklo));
+ __m128i vlohi = _mm_and_si128(_mm_srai_epi16(vlo, n),
+ _mm_set1_epi16(maskhi));
+ vlo = _mm_or_si128(vlolo, vlohi);
+ __m128i vhilo = _mm_and_si128(_mm_srai_epi16(_mm_slli_epi16(vhi, 8), n+8),
+ _mm_set1_epi16(masklo));
+ __m128i vhihi = _mm_and_si128(_mm_srai_epi16(vhi, n),
+ _mm_set1_epi16(maskhi));
+ vhi = _mm_or_si128(vhilo, vhihi);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec operator<<(int_t n) const
+ {
+ __m128i vlo = _mm256_castsi256_si128(v);
+ __m128i vhi = _mm256_extractf128_si256(v, 1);
+ uint_t masklo = U(0x00ffU);
+ uint_t maskhi = U(0xff00U) << U(n);
+ __m128i mask = _mm_set1_epi16(masklo | maskhi);
+ vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask);
+ vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+ }
+ intvec& operator>>=(int_t n) { return *this=*this>>n; }
+ intvec& operator<<=(int_t n) { return *this=*this<<n; }
+
+ intvec lsr(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, U(r[i]) >> U(n[i]));
+ }
+ return r;
+ }
+ intvec operator>>(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, r[i] >> n[i]);
+ }
+ return r;
+ }
+ intvec operator<<(intvec n) const
+ {
+ intvec r = *this;
+ for (int i=0; i<size; ++i) {
+ r.set_elt(i, r[i] << n[i]);
+ }
+ return r;
+ }
+ intvec& operator>>=(intvec n) { return *this=*this>>n; }
+ intvec& operator<<=(intvec n) { return *this=*this<<n; }
+
+
+
+ boolvec_t operator==(intvec const& x) const
+ {
+ return ! (*this != x);
+ }
+ boolvec_t operator!=(intvec const& x) const
+ {
+ return (*this ^ x).convert_bool();
+ }
+ };
+
+
+
+ template<>
+ struct realvec<fp8,32>: floatprops<fp8>
+ {
+ static int const size = 32;
+ typedef real_t scalar_t;
+ typedef __m256i vector_t;
+
+ static char const* name() { return "<AVX:32*fp8>"; }
+ inline void barrier() { asm("": "+x" (v)); }
+
+ static_assert(size * sizeof(real_t) == sizeof(vector_t),
+ "vector size is wrong");
+
+ typedef boolvec<real_t, size> boolvec_t;
+ typedef intvec<real_t, size> intvec_t;
+ typedef realvec realvec_t;
+
+ // Short names for type casts
+ typedef real_t R;
+ typedef int_t I;
+ typedef uint_t U;
+ typedef realvec_t RV;
+ typedef intvec_t IV;
+ typedef boolvec_t BV;
+ typedef floatprops<real_t> FP;
+ typedef mathfuncs<realvec_t> MF;
+
+
+
+ vector_t v;
+
+ realvec() {}
+ // Can't have a non-trivial copy constructor; if so, objects won't
+ // be passed in registers
+ // realvec(realvec const& x): v(x.v) {}
+ // realvec& operator=(realvec const& x) { return v=x.v, *this; }
+ realvec(vector_t x): v(x) {}
+ realvec(real_t a): v(_mm256_set1_epi8(FP::as_int(a))) {}
+ realvec(real_t const* as):
+ v(_mm256_set_epi8(FP::as_int(as[31]),
+ FP::as_int(as[30]),
+ FP::as_int(as[29]),
+ FP::as_int(as[28]),
+ FP::as_int(as[27]),
+ FP::as_int(as[26]),
+ FP::as_int(as[25]),
+ FP::as_int(as[24]),
+ FP::as_int(as[23]),
+ FP::as_int(as[22]),
+ FP::as_int(as[21]),
+ FP::as_int(as[20]),
+ FP::as_int(as[19]),
+ FP::as_int(as[18]),
+ FP::as_int(as[17]),
+ FP::as_int(as[16]),
+ FP::as_int(as[15]),
+ FP::as_int(as[14]),
+ FP::as_int(as[13]),
+ FP::as_int(as[12]),
+ FP::as_int(as[11]),
+ FP::as_int(as[10]),
+ FP::as_int(as[ 9]),
+ FP::as_int(as[ 8]),
+ FP::as_int(as[ 7]),
+ FP::as_int(as[ 6]),
+ FP::as_int(as[ 5]),
+ FP::as_int(as[ 4]),
+ FP::as_int(as[ 3]),
+ FP::as_int(as[ 2]),
+ FP::as_int(as[ 1]),
+ FP::as_int(as[ 0]))) {}
+
+ operator vector_t() const { return v; }
+ real_t operator[](int n) const { return ((real_t const*)&v)[n]; }
+ realvec& set_elt(int n, real_t a) { return ((real_t*)&v)[n]=a, *this; }
+
+
+
+ typedef vecmathlib::mask_t<realvec_t> mask_t;
+
+ static realvec_t loada(real_t const* p)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ return _mm256_load_si256((__m256i const*)p);
+ }
+ static realvec_t loadu(real_t const* p)
+ {
+ return _mm256_loadu_si256((__m256i const*)p);
+ }
+ static realvec_t loadu(real_t const* p, size_t ioff)
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p);
+ return loadu(p+ioff);
+ }
+ realvec_t loada(real_t const* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(all(m.m), true)) {
+ return loada(p);
+ } else {
+ return m.m.ifthen(loada(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ return loadu(p);
+ } else {
+ return m.m.ifthen(loadu(p), *this);
+ }
+ }
+ realvec_t loadu(real_t const* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return loada(p, m);
+ return loadu(p+ioff, m);
+ }
+
+ void storea(real_t* p) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ _mm256_store_si256((__m256i*)p, v);
+ }
+ void storeu(real_t* p) const
+ {
+ return _mm256_storeu_si256((__m256i*)p, v);
+ }
+ void storeu(real_t* p, size_t ioff) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p);
+ storeu(p+ioff);
+ }
+ void storea(real_t* p, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (__builtin_expect(m.all_m, true)) {
+ storea(p);
+ } else {
+ // TODO: this is expensive
+ for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+ }
+ }
+ void storeu(real_t* p, mask_t const& m) const
+ {
+ if (__builtin_expect(m.all_m, true)) {
+ storeu(p);
+ } else {
+ // TODO: this is expensive
+ for (int n=0; n<size; ++n) if (m.m[n]) p[n] = (*this)[n];
+ }
+ }
+ void storeu(real_t* p, size_t ioff, mask_t const& m) const
+ {
+ VML_ASSERT(intptr_t(p) % sizeof(realvec_t) == 0);
+ if (ioff==0) return storea(p, m);
+ storeu(p+ioff, m);
+ }
+
+
+
+ intvec_t as_int() const { return v; }
+ intvec_t convert_int() const { __builtin_unreachable(); }
+
+
+
+ realvec operator+() const { __builtin_unreachable(); }
+ realvec operator-() const { __builtin_unreachable(); }
+
+ realvec operator+(realvec x) const { __builtin_unreachable(); }
+ realvec operator-(realvec x) const { __builtin_unreachable(); }
+ realvec operator*(realvec x) const { __builtin_unreachable(); }
+ realvec operator/(realvec x) const { __builtin_unreachable(); }
+
+ realvec& operator+=(realvec const& x) { return *this=*this+x; }
+ realvec& operator-=(realvec const& x) { return *this=*this-x; }
+ realvec& operator*=(realvec const& x) { return *this=*this*x; }
+ realvec& operator/=(realvec const& x) { return *this=*this/x; }
+
+ real_t prod() const { __builtin_unreachable(); }
+ real_t sum() const { __builtin_unreachable(); }
+
+
+
+ boolvec_t operator==(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator!=(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator<(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator<=(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator>(realvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator>=(realvec const& x) const { __builtin_unreachable(); }
+
+
+
+ realvec copysign(realvec y) const { return MF::vml_copysign(*this, y); }
+ realvec fabs() const { return MF::vml_fabs(*this); }
+ intvec_t ilogb() const { return MF::vml_ilogb(*this); }
+ boolvec_t isfinite() const { return MF::vml_isfinite(*this); }
+ boolvec_t isinf() const { return MF::vml_isinf(*this); }
+ boolvec_t isnan() const { return MF::vml_isnan(*this); }
+ boolvec_t isnormal() const { return MF::vml_isnormal(*this); }
+ realvec ldexp(int_t n) const { return MF::vml_ldexp(*this, n); }
+ realvec ldexp(intvec_t n) const { return MF::vml_ldexp(*this, n); }
+ boolvec_t signbit() const { return v; }
+ };
+
+
+
+ // boolvec definitions
+
+ inline
+ auto boolvec<fp8,32>::as_int() const -> intvec_t
+ {
+ return v;
+ }
+
+ inline
+ auto boolvec<fp8,32>::convert_int() const -> intvec_t
+ {
+ return lsr(as_int(), bits-1);
+ }
+
+ inline
+ auto boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const -> intvec_t
+ {
+ return ifthen(x.as_float(), y.as_float()).as_int();
+ }
+
+ inline
+ auto boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const -> realvec_t
+ {
+ return (( -convert_int() & x.as_int()) |
+ (~-convert_int() & y.as_int())).as_float();
+ }
+
+
+
+ // intvec definitions
+
+ inline auto intvec<fp8,32>::as_float() const -> realvec_t
+ {
+ return v;
+ }
+
+ inline auto intvec<fp8,32>::convert_float() const -> realvec_t
+ {
+ __builtin_unreachable();
+ }
+
+} // namespace vecmathlib
+
+#endif // #ifndef VEC_FP8_AVX_H
diff --git a/vecmathlib.h b/vecmathlib.h
index 451a967..abfeda3 100644
--- a/vecmathlib.h
+++ b/vecmathlib.h
@@ -37,20 +37,16 @@
// Intel SSE 2
#if defined __SSE2__
# include "vec_float_sse2_scalar.h"
-# include "vec_float_sse2.h"
-#endif
-// Intel AVX
-#if defined __AVX__
-# include "vec_float_avx.h"
-#endif
-
-// Intel SSE 2
-#if defined __SSE2__
# include "vec_double_sse2_scalar.h"
+# include "vec_float_sse2.h"
# include "vec_double_sse2.h"
#endif
+
// Intel AVX
#if defined __AVX__
+# include "vec_fp8_avx.h"
+# include "vec_fp16_avx.h"
+# include "vec_float_avx.h"
# include "vec_double_avx.h"
#endif
OpenPOWER on IntegriCloud