diff options
author | Erik Schnetter <schnetter@gmail.com> | 2015-09-03 12:57:38 -0400 |
---|---|---|
committer | Erik Schnetter <schnetter@gmail.com> | 2015-09-03 12:57:38 -0400 |
commit | 236bd3ff4ca26e77ee74bb03aeef0a82fb946a2e (patch) | |
tree | ac491a14388777e0f2f9714b6d0a1c4ef6e50777 | |
parent | 1ab77ee14f6657f2aaf1275a6fc13dba9471db4f (diff) | |
download | vecmathlib-236bd3ff4ca26e77ee74bb03aeef0a82fb946a2e.zip vecmathlib-236bd3ff4ca26e77ee74bb03aeef0a82fb946a2e.tar.gz |
Use AVX2 intrinsics if available
-rw-r--r-- | vec_avx_double4.h | 76 | ||||
-rw-r--r-- | vec_avx_float8.h | 74 | ||||
-rw-r--r-- | vec_avx_fp16_16.h | 111 | ||||
-rw-r--r-- | vec_avx_fp8_32.h | 120 | ||||
-rw-r--r-- | vec_sse_double2.h | 2 |
5 files changed, 356 insertions, 27 deletions
diff --git a/vec_avx_double4.h b/vec_avx_double4.h index 2699a43..e949b2e 100644 --- a/vec_avx_double4.h +++ b/vec_avx_double4.h @@ -173,13 +173,17 @@ namespace vecmathlib { boolvec_t convert_bool() const { // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check +#ifdef __AVX2__ + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, _mm256_setzero_si256())); +#else + // There is no intrinsic to compare to zero. Instead, we check // whether x is positive and x-1 is negative. intvec_t x = *this; // We know that boolvec_t values depend only on the sign bit // return (~(x-1) | x).as_bool(); // return x.as_bool() || !(x-1).as_bool(); return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); +#endif } realvec_t as_float() const; // defined after realvec realvec_t convert_float() const; // defined after realvec @@ -193,6 +197,9 @@ namespace vecmathlib { intvec_t operator+(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_add_epi64(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -200,9 +207,13 @@ namespace vecmathlib { vlo = _mm_add_epi64(vlo, xvlo); vhi = _mm_add_epi64(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t operator-(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_sub_epi64(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -210,6 +221,7 @@ namespace vecmathlib { vlo = _mm_sub_epi64(vlo, xvlo); vhi = _mm_sub_epi64(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } @@ -221,18 +233,30 @@ namespace vecmathlib { intvec_t operator&(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_and_si256(v, x.v); +#else return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); +#endif } intvec_t operator|(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_or_si256(v, x.v); +#else return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); +#endif } intvec_t operator^(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_xor_si256(v, x.v); +#else return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(v), _mm256_castsi256_pd(x.v))); +#endif } intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } @@ -245,15 +269,26 @@ namespace vecmathlib { intvec_t lsr(int_t n) const { +#ifdef __AVX2__ + return _mm256_srli_epi64(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_srli_epi64(vlo, n); vhi = _mm_srli_epi64(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t rotate(int_t n) const; intvec_t operator>>(int_t n) const { +#ifdef __AVX2__ + // There is no _mm256_srai_epi64. To emulate it, add 0x80000000 + // before shifting, and subtract the shifted 0x80000000 after + // shifting + intvec_t offset = U(1) << (bits-1); + return (*this + offset).lsr(n) - offset.lsr(n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); // There is no _mm_srai_epi64. To emulate it, add 0x80000000 @@ -282,42 +317,61 @@ namespace vecmathlib { vhi = _mm_sub_epi64(vhi, _mm_set1_epi64x(U(1) << (bits-1-n))); #endif return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t operator<<(int_t n) const { +#ifdef __AVX2__ + return _mm256_slli_epi64(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_slli_epi64(vlo, n); vhi = _mm_slli_epi64(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t& operator>>=(int_t n) { return *this=*this>>n; } intvec_t& operator<<=(int_t n) { return *this=*this<<n; } intvec_t lsr(intvec_t n) const { +#ifdef __AVX2__ + return _mm256_srlv_epi64(v, n.v); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, U((*this)[i]) >> U(n[i])); } return r; +#endif } intvec_t rotate(intvec_t n) const; intvec_t operator>>(intvec_t n) const { +#ifdef __AVX2__ + // See operator>> above + intvec_t offset = U(1) << (bits-1); + return (*this + offset).lsr(n) - offset.lsr(n); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] >> n[i]); } return r; +#endif } intvec_t operator<<(intvec_t n) const { +#ifdef __AVX2__ + return _mm256_sllv_epi64(v, n.v); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] << n[i]); } return r; +#endif } intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } @@ -329,20 +383,32 @@ namespace vecmathlib { boolvec_t operator==(intvec_t const& x) const { +#ifdef __AVX2__ + return _mm256_castsi256_pd(_mm256_cmpeq_epi64(v, x.v)); +#else return ! (*this != x); +#endif } boolvec_t operator!=(intvec_t const& x) const { +#ifdef __AVX2__ + return ! (*this == x); +#else return (*this ^ x).convert_bool(); +#endif } boolvec_t operator<(intvec_t const& x) const { +#ifdef __AVX2__ + return _mm256_castsi256_pd(_mm256_cmpgt_epi64(x.v, v)); +#else // return (*this - x).as_bool(); boolvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] < x[i]); } return r; +#endif } boolvec_t operator<=(intvec_t const& x) const { @@ -373,7 +439,13 @@ namespace vecmathlib { typedef __m256d vector_t; static int const alignment = sizeof(vector_t); - static char const* name() { return "<AVX:4*double>"; } + static char const* name() { +#ifdef __AVX2__ + return "<AVX2:4*double>"; +#else + return "<AVX:4*double>"; +#endif + } void barrier() { __asm__("": "+x"(v)); } static_assert(size * sizeof(real_t) == sizeof(vector_t), diff --git a/vec_avx_float8.h b/vec_avx_float8.h index bba77cb..2b1e386 100644 --- a/vec_avx_float8.h +++ b/vec_avx_float8.h @@ -182,13 +182,17 @@ namespace vecmathlib { boolvec_t convert_bool() const { // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check +#ifdef __AVX2__ + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, _mm256_setzero_si256())); +#else + // There is no intrinsic to compare to zero. Instead, we check // whether x is positive and x-1 is negative. intvec_t x = *this; // We know that boolvec_t values depend only on the sign bit // return (~(x-1) | x).as_bool(); // return x.as_bool() || !(x-1).as_bool(); return x.as_bool() || (x + (FP::signbit_mask - 1)).as_bool(); +#endif } realvec_t as_float() const; // defined after realvec realvec_t convert_float() const; // defined after realvec @@ -202,6 +206,9 @@ namespace vecmathlib { intvec_t operator+(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_add_epi32(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -209,9 +216,13 @@ namespace vecmathlib { vlo = _mm_add_epi32(vlo, xvlo); vhi = _mm_add_epi32(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t operator-(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_sub_epi32(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -219,6 +230,7 @@ namespace vecmathlib { vlo = _mm_sub_epi32(vlo, xvlo); vhi = _mm_sub_epi32(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t& operator+=(intvec_t const& x) { return *this=*this+x; } @@ -230,18 +242,30 @@ namespace vecmathlib { intvec_t operator&(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_and_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec_t operator|(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_or_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec_t operator^(intvec_t x) const { +#ifdef __AVX2__ + return _mm256_xor_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec_t& operator&=(intvec_t const& x) { return *this=*this&x; } @@ -254,56 +278,80 @@ namespace vecmathlib { intvec_t lsr(int_t n) const { +#ifdef __AVX2__ + return _mm256_srli_epi32(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_srli_epi32(vlo, n); vhi = _mm_srli_epi32(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t rotate(int_t n) const; intvec_t operator>>(int_t n) const { +#ifdef __AVX2__ + return _mm256_srai_epi32(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_srai_epi32(vlo, n); vhi = _mm_srai_epi32(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t operator<<(int_t n) const { +#ifdef __AVX2__ + return _mm256_slli_epi32(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_slli_epi32(vlo, n); vhi = _mm_slli_epi32(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec_t& operator>>=(int_t n) { return *this=*this>>n; } intvec_t& operator<<=(int_t n) { return *this=*this<<n; } intvec_t lsr(intvec_t n) const { +#ifdef __AVX2__ + return _mm256_srlv_epi32(v, n.v); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, U((*this)[i]) >> U(n[i])); } return r; +#endif } intvec_t rotate(intvec_t n) const; intvec_t operator>>(intvec_t n) const { +#ifdef __AVX2__ + return _mm256_srav_epi32(v, n.v); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] >> n[i]); } return r; +#endif } intvec_t operator<<(intvec_t n) const { +#ifdef __AVX2__ + return _mm256_sllv_epi32(v, n.v); +#else intvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] << n[i]); } return r; +#endif } intvec_t& operator>>=(intvec_t n) { return *this=*this>>n; } intvec_t& operator<<=(intvec_t n) { return *this=*this<<n; } @@ -315,20 +363,32 @@ namespace vecmathlib { boolvec_t operator==(intvec_t const& x) const { +#ifdef __AVX2__ + return _mm256_castsi256_ps(_mm256_cmpeq_epi32(v, x.v)); +#else return ! (*this != x); +#endif } boolvec_t operator!=(intvec_t const& x) const { +#ifdef __AVX2__ + return ! (*this == x); +#else return (*this ^ x).convert_bool(); +#endif } boolvec_t operator<(intvec_t const& x) const { +#ifdef __AVX2__ + return _mm256_castsi256_ps(_mm256_cmpgt_epi32(x.v, v)); +#else // return (*this - x).as_bool(); boolvec_t r; for (int i=0; i<size; ++i) { r.set_elt(i, (*this)[i] < x[i]); } return r; +#endif } boolvec_t operator<=(intvec_t const& x) const { @@ -359,7 +419,13 @@ namespace vecmathlib { typedef __m256 vector_t; static int const alignment = sizeof(vector_t); - static char const* name() { return "<AVX:8*float>"; } + static char const* name() { +#ifdef __AVX2__ + return "<AVX2:8*float>"; +#else + return "<AVX:8*float>"; +#endif + } void barrier() { __asm__("": "+x"(v)); } static_assert(size * sizeof(real_t) == sizeof(vector_t), @@ -715,7 +781,11 @@ namespace vecmathlib { inline intvec<float,8> intvec<float,8>::abs() const { +#ifdef __AVX2__ + return _mm256_abs_epi32(v); +#else return MF::vml_abs(*this); +#endif } inline realvec<float,8> intvec<float,8>::as_float() const diff --git a/vec_avx_fp16_16.h b/vec_avx_fp16_16.h index e461ce4..20cd5ef 100644 --- a/vec_avx_fp16_16.h +++ b/vec_avx_fp16_16.h @@ -218,7 +218,7 @@ namespace vecmathlib { boolvec_t convert_bool() const { // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check + // There is no intrinsic to compare to zero. Instead, we check // whether x is positive and x-1 is negative. intvec x = *this; // We know that boolvec values depend only on the sign bit @@ -238,6 +238,9 @@ namespace vecmathlib { intvec operator+(intvec x) const { +#ifdef __AVX2__ + return _mm256_add_epi16(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -245,9 +248,13 @@ namespace vecmathlib { vlo = _mm_add_epi16(vlo, xvlo); vhi = _mm_add_epi16(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator-(intvec x) const { +#ifdef __AVX2__ + return _mm256_sub_epi16(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -255,6 +262,7 @@ namespace vecmathlib { vlo = _mm_sub_epi16(vlo, xvlo); vhi = _mm_sub_epi16(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec& operator+=(intvec const& x) { return *this=*this+x; } @@ -266,18 +274,30 @@ namespace vecmathlib { intvec operator&(intvec x) const { +#ifdef __AVX2__ + return _mm256_and_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec operator|(intvec x) const { +#ifdef __AVX2__ + return _mm256_or_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec operator^(intvec x) const { +#ifdef __AVX2__ + return _mm256_xor_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec& operator&=(intvec const& x) { return *this=*this&x; } @@ -288,27 +308,39 @@ namespace vecmathlib { intvec lsr(int_t n) const { +#ifdef __AVX2__ + return _mm256_srli_epi16(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_srli_epi16(vlo, n); vhi = _mm_srli_epi16(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator>>(int_t n) const { +#ifdef __AVX2__ + return _mm256_srai_epi16(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_srai_epi16(vlo, n); vhi = _mm_srai_epi16(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator<<(int_t n) const { +#ifdef __AVX2__ + return _mm256_slli_epi16(v, n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); vlo = _mm_slli_epi16(vlo, n); vhi = _mm_slli_epi16(vhi, n); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec& operator>>=(int_t n) { return *this=*this>>n; } intvec& operator<<=(int_t n) { return *this=*this<<n; } @@ -344,18 +376,51 @@ namespace vecmathlib { boolvec_t operator==(intvec const& x) const { +#ifdef __AVX2__ + return _mm256_cmpeq_epi16(v, x.v); +#else return ! (*this != x); +#endif } boolvec_t operator!=(intvec const& x) const { +#ifdef __AVX2__ + return ! (*this == x); +#else return (*this ^ x).convert_bool(); +#endif } - // TODO: First compare sign; then if equal, compare sign of difference - // TODO: Also look for intrinsics - boolvec_t operator<(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<=(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(intvec const& x) const { __builtin_unreachable(); } + boolvec_t operator<(intvec const& x) const + { +#ifdef __AVX2__ + return _mm256_cmpgt_epi16(x.v, v); +#else + // TODO: First compare sign; then if equal, compare sign of difference + // TODO: Also look for intrinsics + boolvec_t r; + for (int i=0; i<size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); + } + return r; +#endif + } + boolvec_t operator<=(intvec_t const& x) const + { + return ! (*this > x); + } + boolvec_t operator>(intvec_t const& x) const + { + return x < *this; + } + boolvec_t operator>=(intvec_t const& x) const + { + return ! (*this < x); + } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; }; @@ -368,7 +433,13 @@ namespace vecmathlib { typedef __m256i vector_t; static int const alignment = sizeof(vector_t); - static char const* name() { return "<AVX:16*fp16>"; } + static char const* name() { +#ifdef __AVX2__ + return "<AVX2:16*fp16>"; +#else + return "<AVX:16*fp16>"; +#endif + } void barrier() { __asm__("": "+x"(v)); } static_assert(size * sizeof(real_t) == sizeof(vector_t), @@ -581,20 +652,28 @@ namespace vecmathlib { inline intvec<fp16,16> boolvec<fp16,16>::ifthen(intvec_t x, intvec_t y) const { - return ifthen(x.as_float(), y.as_float()).as_int(); + return (( -convert_int() & x) | (~-convert_int() & y)); } inline realvec<fp16,16> boolvec<fp16,16>::ifthen(realvec_t x, realvec_t y) const { - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); + return ifthen(x.as_int(), y.as_int()).as_float(); } // intvec definitions + inline intvec<fp16,16> intvec<fp16,16>::abs() const + { +#ifdef __AVX2__ + return _mm256_abs_epi16(v); +#else + return MF::vml_abs(*this); +#endif + } + inline realvec<fp16,16> intvec<fp16,16>::as_float() const { return v; @@ -605,6 +684,16 @@ namespace vecmathlib { __builtin_unreachable(); } + inline intvec<fp16,16> intvec<fp16,16>::max(intvec_t x) const + { + return MF::vml_max(*this, x); + } + + inline intvec<fp16,16> intvec<fp16,16>::min(intvec_t x) const + { + return MF::vml_min(*this, x); + } + } // namespace vecmathlib #endif // #ifndef VEC_AVX_FP16_16_H diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h index 5ed93e4..912bd19 100644 --- a/vec_avx_fp8_32.h +++ b/vec_avx_fp8_32.h @@ -252,7 +252,7 @@ namespace vecmathlib { boolvec_t convert_bool() const { // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check + // There is no intrinsic to compare to zero. Instead, we check // whether x is positive and x-1 is negative. intvec x = *this; // We know that boolvec values depend only on the sign bit @@ -272,6 +272,9 @@ namespace vecmathlib { intvec operator+(intvec x) const { +#ifdef __AVX2__ + return _mm256_add_epi8(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -279,9 +282,13 @@ namespace vecmathlib { vlo = _mm_add_epi8(vlo, xvlo); vhi = _mm_add_epi8(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator-(intvec x) const { +#ifdef __AVX2__ + return _mm256_sub_epi8(v, x.v); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); __m128i xvlo = _mm256_castsi256_si128(x.v); @@ -289,6 +296,7 @@ namespace vecmathlib { vlo = _mm_sub_epi8(vlo, xvlo); vhi = _mm_sub_epi8(vhi, xvhi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec& operator+=(intvec const& x) { return *this=*this+x; } @@ -300,18 +308,30 @@ namespace vecmathlib { intvec operator&(intvec x) const { +#ifdef __AVX2__ + return _mm256_and_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec operator|(intvec x) const { +#ifdef __AVX2__ + return _mm256_or_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec operator^(intvec x) const { +#ifdef __AVX2__ + return _mm256_xor_si256(v, x.v); +#else return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v), _mm256_castsi256_ps(x.v))); +#endif } intvec& operator&=(intvec const& x) { return *this=*this&x; } @@ -322,6 +342,12 @@ namespace vecmathlib { intvec lsr(int_t n) const { +#ifdef __AVX2__ + uint_t masklo = U(0x00ffU) >> U(n); + uint_t maskhi = U(0xff00U); + intvec mask = masklo | maskhi; + return intvec(_mm256_srai_epi16(v, n)) & mask; +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); uint_t masklo = U(0x00ffU) >> U(n); @@ -330,9 +356,16 @@ namespace vecmathlib { vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask); vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator>>(int_t n) const { +#ifdef __AVX2__ + // There is no _mm256_srai_epi8. To emulate it, add 0x80 before + // shifting, and subtract the shifted 0x80 after shifting + intvec_t offset = U(1) << (bits-1); + return (*this + offset).lsr(n) - offset.lsr(n); +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); uint_t masklo = U(0x00ffU); @@ -348,9 +381,16 @@ namespace vecmathlib { _mm_set1_epi16(maskhi)); vhi = _mm_or_si128(vhilo, vhihi); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec operator<<(int_t n) const { +#ifdef __AVX2__ + uint_t masklo = U(0x00ffU); + uint_t maskhi = U(0xff00U) << U(n); + intvec mask = masklo | maskhi; + return intvec(_mm256_slli_epi16(v, n)) & mask; +#else __m128i vlo = _mm256_castsi256_si128(v); __m128i vhi = _mm256_extractf128_si256(v, 1); uint_t masklo = U(0x00ffU); @@ -359,6 +399,7 @@ namespace vecmathlib { vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask); return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); +#endif } intvec& operator>>=(int_t n) { return *this=*this>>n; } intvec& operator<<=(int_t n) { return *this=*this<<n; } @@ -394,18 +435,51 @@ namespace vecmathlib { boolvec_t operator==(intvec const& x) const { +#ifdef __AVX2__ + return _mm256_cmpeq_epi8(v, x.v); +#else return ! (*this != x); +#endif } boolvec_t operator!=(intvec const& x) const { +#ifdef __AVX2__ + return ! (*this == x); +#else return (*this ^ x).convert_bool(); +#endif + } + boolvec_t operator<(intvec const& x) const + { +#ifdef __AVX2__ + return _mm256_cmpgt_epi8(x.v, v); +#else + // TODO: First compare sign; then if equal, compare sign of difference + // TODO: Also look for intrinsics + boolvec_t r; + for (int i=0; i<size; ++i) { + r.set_elt(i, (*this)[i] < x[i]); + } + return r; +#endif + } + boolvec_t operator<=(intvec_t const& x) const + { + return ! (*this > x); + } + boolvec_t operator>(intvec_t const& x) const + { + return x < *this; } - // TODO: First compare sign; then if equal, compare sign of difference - // TODO: Also look for intrinsics - boolvec_t operator<(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator<=(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>(intvec const& x) const { __builtin_unreachable(); } - boolvec_t operator>=(intvec const& x) const { __builtin_unreachable(); } + boolvec_t operator>=(intvec_t const& x) const + { + return ! (*this < x); + } + + intvec_t abs() const; + boolvec_t isignbit() const { return as_bool(); } + intvec_t max(intvec_t x) const; + intvec_t min(intvec_t x) const; }; @@ -418,7 +492,13 @@ namespace vecmathlib { typedef __m256i vector_t; static int const alignment = sizeof(vector_t); - static char const* name() { return "<AVX:32*fp8>"; } + static char const* name() { +#ifdef __AVX2__ + return "<AVX2:32*fp8>"; +#else + return "<AVX:32*fp8>"; +#endif + } void barrier() { __asm__("": "+x"(v)); } static_assert(size * sizeof(real_t) == sizeof(vector_t), @@ -647,20 +727,28 @@ namespace vecmathlib { inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const { - return ifthen(x.as_float(), y.as_float()).as_int(); + return (( -convert_int() & x) | (~-convert_int() & y)); } inline realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const { - return (( -convert_int() & x.as_int()) | - (~-convert_int() & y.as_int())).as_float(); + return ifthen(x.as_int(), y.as_int()).as_float(); } // intvec definitions + inline intvec<fp8,32> intvec<fp8,32>::abs() const + { +#ifdef __AVX2__ + return _mm256_abs_epi8(v); +#else + return MF::vml_abs(*this); +#endif + } + inline realvec<fp8,32> intvec<fp8,32>::as_float() const { return v; @@ -671,6 +759,16 @@ namespace vecmathlib { __builtin_unreachable(); } + inline intvec<fp8,32> intvec<fp8,32>::max(intvec_t x) const + { + return MF::vml_max(*this, x); + } + + inline intvec<fp8,32> intvec<fp8,32>::min(intvec_t x) const + { + return MF::vml_min(*this, x); + } + } // namespace vecmathlib #endif // #ifndef VEC_AVX_FP8_32_H diff --git a/vec_sse_double2.h b/vec_sse_double2.h index 11790c3..5d64688 100644 --- a/vec_sse_double2.h +++ b/vec_sse_double2.h @@ -188,7 +188,7 @@ namespace vecmathlib { boolvec_t convert_bool() const { // Result: convert_bool(0)=false, convert_bool(else)=true - // There is no intrinsic to compare with zero. Instead, we check + // There is no intrinsic to compare to zero. Instead, we check // whether x is positive and x-1 is negative. intvec_t x = *this; // We know that boolvec_t values depend only on the sign bit |