summaryrefslogtreecommitdiffstats
path: root/vec_avx_fp8_32.h
diff options
context:
space:
mode:
Diffstat (limited to 'vec_avx_fp8_32.h')
-rw-r--r--vec_avx_fp8_32.h120
1 files changed, 109 insertions, 11 deletions
diff --git a/vec_avx_fp8_32.h b/vec_avx_fp8_32.h
index 5ed93e4..912bd19 100644
--- a/vec_avx_fp8_32.h
+++ b/vec_avx_fp8_32.h
@@ -252,7 +252,7 @@ namespace vecmathlib {
boolvec_t convert_bool() const
{
// Result: convert_bool(0)=false, convert_bool(else)=true
- // There is no intrinsic to compare with zero. Instead, we check
+ // There is no intrinsic to compare to zero. Instead, we check
// whether x is positive and x-1 is negative.
intvec x = *this;
// We know that boolvec values depend only on the sign bit
@@ -272,6 +272,9 @@ namespace vecmathlib {
intvec operator+(intvec x) const // Lane-wise 8-bit integer addition of *this and x
{
+#ifdef __AVX2__
+ return _mm256_add_epi8(v, x.v); // AVX2: a single 256-bit byte-wise add
+#else
__m128i vlo = _mm256_castsi256_si128(v); // low 128-bit half of *this
__m128i vhi = _mm256_extractf128_si256(v, 1); // high 128-bit half of *this
__m128i xvlo = _mm256_castsi256_si128(x.v); // low 128-bit half of x
@@ -279,9 +282,13 @@ namespace vecmathlib {
vlo = _mm_add_epi8(vlo, xvlo); // add the halves separately with 128-bit instructions
vhi = _mm_add_epi8(vhi, xvhi);
return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // reassemble the 256-bit result
+#endif
}
intvec operator-(intvec x) const // Lane-wise 8-bit integer subtraction: *this - x
{
+#ifdef __AVX2__
+ return _mm256_sub_epi8(v, x.v); // AVX2: a single 256-bit byte-wise subtract
+#else
__m128i vlo = _mm256_castsi256_si128(v); // low 128-bit half of *this
__m128i vhi = _mm256_extractf128_si256(v, 1); // high 128-bit half of *this
__m128i xvlo = _mm256_castsi256_si128(x.v); // low 128-bit half of x
@@ -289,6 +296,7 @@ namespace vecmathlib {
vlo = _mm_sub_epi8(vlo, xvlo); // subtract the halves separately with 128-bit instructions
vhi = _mm_sub_epi8(vhi, xvhi);
return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // reassemble the 256-bit result
+#endif
}
intvec& operator+=(intvec const& x) { return *this=*this+x; }
@@ -300,18 +308,30 @@ namespace vecmathlib {
intvec operator&(intvec x) const // Bitwise AND of the two 256-bit vectors
{
+#ifdef __AVX2__
+ return _mm256_and_si256(v, x.v); // integer-typed AND, used when AVX2 is enabled
+#else
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v),
_mm256_castsi256_ps(x.v))); // without AVX2: cast to float vectors, AND, cast back
+#endif
}
intvec operator|(intvec x) const // Bitwise OR of the two 256-bit vectors
{
+#ifdef __AVX2__
+ return _mm256_or_si256(v, x.v); // integer-typed OR, used when AVX2 is enabled
+#else
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v),
_mm256_castsi256_ps(x.v))); // without AVX2: cast to float vectors, OR, cast back
+#endif
}
intvec operator^(intvec x) const // Bitwise XOR of the two 256-bit vectors
{
+#ifdef __AVX2__
+ return _mm256_xor_si256(v, x.v); // integer-typed XOR, used when AVX2 is enabled
+#else
return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v),
_mm256_castsi256_ps(x.v))); // without AVX2: cast to float vectors, XOR, cast back
+#endif
}
intvec& operator&=(intvec const& x) { return *this=*this&x; }
@@ -322,6 +342,12 @@ namespace vecmathlib {
intvec lsr(int_t n) const
{
+#ifdef __AVX2__
+ // Logical (zero-filling) right shift of every 8-bit lane by n bits.
+ // There is no 8-bit shift instruction, so shift 16-bit lanes and mask
+ // away the n bits that spill from each high byte into the top of the
+ // neighbouring low byte.
+ // NOTE(review): assumes uint_t and the intvec broadcast can carry the
+ // 16-bit mask pattern -- confirm against their definitions.
+ uint_t masklo = U(0x00ffU) >> U(n);
+ uint_t maskhi = U(0xff00U);
+ intvec mask = masklo | maskhi;
+ // This must be the logical shift (srli), as in the non-AVX2 branch.
+ // The arithmetic shift (srai) would fill the top of each high byte
+ // with copies of the sign bit, which maskhi does not clear.
+ return intvec(_mm256_srli_epi16(v, n)) & mask;
+#else
__m128i vlo = _mm256_castsi256_si128(v);
__m128i vhi = _mm256_extractf128_si256(v, 1);
uint_t masklo = U(0x00ffU) >> U(n);
@@ -330,9 +356,16 @@ namespace vecmathlib {
vlo = _mm_and_si128(_mm_srli_epi16(vlo, n), mask);
vhi = _mm_and_si128(_mm_srli_epi16(vhi, n), mask);
return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1);
+#endif
}
intvec operator>>(int_t n) const // Right shift of every 8-bit lane by n bits
{
+#ifdef __AVX2__
+ // There is no _mm256_srai_epi8. To emulate an arithmetic shift, add
+ // 0x80 before the logical shift, and subtract the shifted 0x80 afterwards
+ intvec_t offset = U(1) << (bits-1); // only the top bit set (0x80 per the comment above)
+ return (*this + offset).lsr(n) - offset.lsr(n); // relies on lsr() being a true logical shift
+#else
__m128i vlo = _mm256_castsi256_si128(v); // low 128-bit half
__m128i vhi = _mm256_extractf128_si256(v, 1); // high 128-bit half
uint_t masklo = U(0x00ffU);
@@ -348,9 +381,16 @@ namespace vecmathlib {
_mm_set1_epi16(maskhi));
vhi = _mm_or_si128(vhilo, vhihi); // merge the separately computed low-byte and high-byte parts
return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // reassemble the 256-bit result
+#endif
}
intvec operator<<(int_t n) const // Left shift of every 8-bit lane by n bits
{
+#ifdef __AVX2__
+ uint_t masklo = U(0x00ffU); // low byte of each 16-bit lane is kept as shifted
+ uint_t maskhi = U(0xff00U) << U(n); // intended to clear the n bits spilled into the high byte; NOTE(review): needs uint_t wider than 8 bits -- confirm
+ intvec mask = masklo | maskhi;
+ return intvec(_mm256_slli_epi16(v, n)) & mask; // 16-bit shift, then mask off bits that crossed a byte boundary
+#else
__m128i vlo = _mm256_castsi256_si128(v); // low 128-bit half
__m128i vhi = _mm256_extractf128_si256(v, 1); // high 128-bit half
uint_t masklo = U(0x00ffU);
@@ -359,6 +399,7 @@ namespace vecmathlib {
vlo = _mm_and_si128(_mm_slli_epi16(vlo, n), mask); // same shift-and-mask on each half
vhi = _mm_and_si128(_mm_slli_epi16(vhi, n), mask);
return _mm256_insertf128_si256(_mm256_castsi128_si256(vlo), vhi, 1); // reassemble the 256-bit result
+#endif
}
intvec& operator>>=(int_t n) { return *this=*this>>n; }
intvec& operator<<=(int_t n) { return *this=*this<<n; }
@@ -394,18 +435,51 @@ namespace vecmathlib {
boolvec_t operator==(intvec const& x) const // Lane-wise equality test
{
+#ifdef __AVX2__
+ return _mm256_cmpeq_epi8(v, x.v); // AVX2: direct byte-wise compare
+#else
return ! (*this != x); // without AVX2: negate operator!=, which is the primitive in that configuration
+#endif
}
boolvec_t operator!=(intvec const& x) const // Lane-wise inequality test
{
+#ifdef __AVX2__
+ return ! (*this == x); // AVX2: negate operator==, which is the primitive in that configuration
+#else
return (*this ^ x).convert_bool(); // XOR is non-zero exactly where lanes differ; convert_bool maps non-zero to true
+#endif
+ }
+ boolvec_t operator<(intvec const& x) const
+ {
+#ifndef __AVX2__
+ // No vector compare is used without AVX2: test each lane in scalar code.
+ // TODO: First compare sign; then if equal, compare sign of difference
+ // TODO: Also look for intrinsics
+ boolvec_t res;
+ for (int lane=0; lane<size; ++lane) {
+ res.set_elt(lane, (*this)[lane] < x[lane]);
+ }
+ return res;
+#else
+ // a < b is evaluated as b > a with the signed byte-wise compare
+ return _mm256_cmpgt_epi8(x.v, v);
+#endif
+ }
+ boolvec_t operator<=(intvec_t const& x) const
+ {
+ // a <= b is the negation of a > b, i.e. of b < a
+ return ! (x < *this);
+ }
+ boolvec_t operator>(intvec_t const& x) const
+ {
+ // a > b is b < a: delegate to operator< with the operands swapped
+ intvec_t const& self = *this;
+ return x < self;
}
- // TODO: First compare sign; then if equal, compare sign of difference
- // TODO: Also look for intrinsics
- boolvec_t operator<(intvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator<=(intvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>(intvec const& x) const { __builtin_unreachable(); }
- boolvec_t operator>=(intvec const& x) const { __builtin_unreachable(); }
+ boolvec_t operator>=(intvec_t const& x) const
+ {
+ // a >= b is the negation of a < b
+ boolvec_t const below = *this < x;
+ return ! below;
+ }
+
+ intvec_t abs() const; // absolute value; defined out of class below
+ boolvec_t isignbit() const { return as_bool(); } // sign-bit test, delegating to as_bool(); presumably valid because boolvec values depend only on the sign bit
+ intvec_t max(intvec_t x) const; // maximum of *this and x; defined out of class below
+ intvec_t min(intvec_t x) const; // minimum of *this and x; defined out of class below
};
@@ -418,7 +492,13 @@ namespace vecmathlib {
typedef __m256i vector_t;
static int const alignment = sizeof(vector_t);
- static char const* name() { return "<AVX:32*fp8>"; }
+ static char const* name() {
+ // Human-readable tag for this vector type; it records whether the
+ // AVX2 code paths were compiled in.
+#ifndef __AVX2__
+ return "<AVX:32*fp8>";
+#else
+ return "<AVX2:32*fp8>";
+#endif
+ }
void barrier() { __asm__("": "+x"(v)); } // empty asm with v as a read-write operand; presumably an optimization barrier on v -- TODO confirm
static_assert(size * sizeof(real_t) == sizeof(vector_t),
@@ -647,20 +727,28 @@ namespace vecmathlib {
inline intvec<fp8,32> boolvec<fp8,32>::ifthen(intvec_t x, intvec_t y) const
{
- return ifthen(x.as_float(), y.as_float()).as_int();
+ // Bitwise select between x and y. The negated convert_int() serves as
+ // the selection mask (presumably all-ones where this is true -- confirm).
+ intvec_t const pick_x = -convert_int();
+ intvec_t const from_x = pick_x & x;
+ intvec_t const from_y = ~pick_x & y;
+ return from_x | from_y;
}
inline
realvec<fp8,32> boolvec<fp8,32>::ifthen(realvec_t x, realvec_t y) const
{
- return (( -convert_int() & x.as_int()) |
- (~-convert_int() & y.as_int())).as_float();
+ // Select on the integer views of the operands, then convert the
+ // chosen result back with as_float().
+ intvec_t const chosen = ifthen(x.as_int(), y.as_int());
+ return chosen.as_float();
}
// intvec definitions
+ inline intvec<fp8,32> intvec<fp8,32>::abs() const
+ {
+#ifndef __AVX2__
+ // No byte-wise absolute value is used without AVX2: defer to the
+ // generic implementation.
+ return MF::vml_abs(*this);
+#else
+ // AVX2 has a byte-wise absolute-value instruction
+ return _mm256_abs_epi8(v);
+#endif
+ }
+
inline realvec<fp8,32> intvec<fp8,32>::as_float() const
{
return v;
@@ -671,6 +759,16 @@ namespace vecmathlib {
__builtin_unreachable();
}
+ inline intvec<fp8,32> intvec<fp8,32>::max(intvec_t x) const
+ {
+ // Lane-wise maximum of *this and x.
+#ifdef __AVX2__
+ // AVX2 has a signed byte-wise maximum; the lanes are already treated
+ // as signed 8-bit values by operator< (_mm256_cmpgt_epi8).
+ return _mm256_max_epi8(v, x.v);
+#else
+ // Without AVX2, defer to the generic implementation.
+ return MF::vml_max(*this, x);
+#endif
+ }
+
+ inline intvec<fp8,32> intvec<fp8,32>::min(intvec_t x) const
+ {
+ // Lane-wise minimum of *this and x.
+#ifdef __AVX2__
+ // AVX2 has a signed byte-wise minimum; the lanes are already treated
+ // as signed 8-bit values by operator< (_mm256_cmpgt_epi8).
+ return _mm256_min_epi8(v, x.v);
+#else
+ // Without AVX2, defer to the generic implementation.
+ return MF::vml_min(*this, x);
+#endif
+ }
+
} // namespace vecmathlib
#endif // #ifndef VEC_AVX_FP8_32_H
OpenPOWER on IntegriCloud