// Instantiante some functions to be able to inspect the generated // machine code #define VML_NODEBUG #define restrict __restrict__ #include "vecmathlib.h" namespace vecmathlib { template typename realvec_t::real_t get_elt(realvec_t x) { return x[n]; } template realvec_t set_elt(realvec_t x, typename realvec_t::real_t a) { return x.set_elt(n, a); } // template realbuiltinvec fabs(realbuiltinvec x); // template realbuiltinvec fmin(realbuiltinvec x, // realbuiltinvec y); // template intbuiltinvec lsr(intbuiltinvec x, // intbuiltinvec::int_t n); // template intbuiltinvec lsr(intbuiltinvec x, // intbuiltinvec::int_t n); // template intbuiltinvec lsr(intbuiltinvec x, // intbuiltinvec::int_t n); // template intbuiltinvec lsr(intbuiltinvec x, // intbuiltinvec n); // template realbuiltinvec ifthen(realbuiltinvec::boolvec_t c, // realbuiltinvec x, realbuiltinvec y); // template realbuiltinvec ifthen(realbuiltinvec::boolvec_t // c, realbuiltinvec x, realbuiltinvec y); // template realbuiltinvec ifthen(realbuiltinvec::boolvec_t c, // realbuiltinvec x, realbuiltinvec y); // template realbuiltinvec ifthen(realbuiltinvec::boolvec_t // c, realbuiltinvec x, realbuiltinvec y); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1 template realvec round(realvec x); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8 template intvec popcount(intvec); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1 template realvec exp(realvec x); template realvec log(realvec x); template realvec sin(realvec x); template realvec sqrt(realvec x); template realvec::real_t get_elt, 0>(realvec x); template realvec set_elt, 0>(realvec x, realvec::real_t a); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2 template realvec exp(realvec x); template realvec log(realvec x); template realvec sin(realvec x); template realvec sqrt(realvec x); template realvec::real_t get_elt, 0>(realvec); template realvec::real_t get_elt, 1>(realvec); template realvec set_elt, 0>(realvec x, realvec::real_t a); template realvec set_elt, 1>(realvec x, realvec::real_t a); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4 template realvec exp(realvec x); template realvec log(realvec x); template realvec sin(realvec x); template realvec sqrt(realvec x); template realvec::real_t get_elt, 0>(realvec); template realvec::real_t get_elt, 1>(realvec); template realvec::real_t get_elt, 2>(realvec); template realvec::real_t get_elt, 3>(realvec); template realvec set_elt, 0>(realvec x, realvec::real_t a); template realvec set_elt, 1>(realvec x, realvec::real_t a); template realvec set_elt, 2>(realvec x, realvec::real_t a); template realvec set_elt, 3>(realvec x, realvec::real_t a); template intvec popcount(intvec); #endif } // Various tests to detect auto-vectorization features #include #include using namespace std; using namespace vecmathlib; #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4 typedef realvec realV; #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 typedef realvec realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_8 typedef realvec realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_4 typedef realvec realV; #elif defined VECMATHLIB_HAVE_VEC_FLOAT_2 typedef realvec realV; #else #error "There are no vector types" #endif typedef realV::scalar_t real; const int vecsize = realV::size; // Simple, naive loop adding two arrays extern "C" void loop_add(real *a, real *b, real *c, ptrdiff_t n) { for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loadu(&b[i]); realV tmpc = realV::loadu(&c[i]); realV tmpa = tmpb + tmpc; storeu(tmpa, &a[i]); } } // Declare pointers as restrict extern "C" void loop_add_restrict(real *restrict a, real *restrict b, real *restrict c, ptrdiff_t n) { for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loadu(&b[i]); realV tmpc = realV::loadu(&c[i]); realV tmpa = tmpb + tmpc; storeu(tmpa, &a[i]); } } // Declare pointers as restrict and aligned extern "C" void loop_add_aligned(real *restrict a, real *restrict b, real *restrict c, ptrdiff_t n) { for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = tmpb + tmpc; storea(tmpa, &a[i]); } } // Reduction loop extern "C" real loop_dot_reduce(real *restrict a, real *restrict b, ptrdiff_t n) { realV sumV = 0.0; for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpa = realV::loada(&a[i]); realV tmpb = realV::loada(&b[i]); sumV += tmpa * tmpb; } return sum(sumV); } // Loop with a simple if condition (fmax) extern "C" void loop_if_simple(real *restrict a, real *restrict b, real *restrict c, ptrdiff_t n) { for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = ifthen(tmpb > tmpc, tmpb, tmpc); storea(tmpa, &a[i]); } } // Loop with a complex if condition (select) extern "C" void loop_if(real *restrict a, real *restrict b, real *restrict c, ptrdiff_t n) { for (ptrdiff_t i = 0; i < n; i += vecsize) { realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = ifthen(tmpb > realV(0.0), tmpb * tmpc, realV(1.0)); storea(tmpa, &a[i]); } } // Skip ghost points extern "C" void loop_add_masked(real *restrict a, real *restrict b, real *restrict c, ptrdiff_t n) { for (realV::mask_t mask(1, n - 1, 0); mask; ++mask) { ptrdiff_t i = mask.index(); realV tmpb = realV::loada(&b[i]); realV tmpc = realV::loada(&c[i]); realV tmpa = tmpb + tmpc; storea(tmpa, &a[i], mask); } }