diff options
author | Erik Schnetter <schnetter@gmail.com> | 2013-07-05 15:50:36 -0400 |
---|---|---|
committer | Erik Schnetter <schnetter@gmail.com> | 2013-07-05 15:50:36 -0400 |
commit | d76ef6a1ca729105e10834713b6d3793eb64d166 (patch) | |
tree | 792baf10bf4ff6dd3fa50ec271f74c70bdc23f5b | |
parent | 5d6a550fe0f459056e695501c3e06f24bc0c9405 (diff) | |
download | vecmathlib-d76ef6a1ca729105e10834713b6d3793eb64d166.zip vecmathlib-d76ef6a1ca729105e10834713b6d3793eb64d166.tar.gz |
Introduce a truly unaligned example
Also, add a facility for an explicit delay calculation to de-emphasize cache effects.
-rw-r--r-- | loop.cc | 66 |
1 file changed, 57 insertions, 9 deletions
@@ -106,6 +106,14 @@ void init(typename realvec_t::real_t *restrict xptr, // Evolution loop: Simple stencil example (Gaussian smoothing) //////////////////////////////////////////////////////////////////////////////// +// Introduce a delay, so that cache access is not so important +template<typename T> +static T delay(const T x) +{ + return x; + // return log(exp(x)); +} + // Original version, unvectorized template<typename realvec_t> void smooth_scalar(typename realvec_t::real_t const *restrict xptr, @@ -123,14 +131,14 @@ void smooth_scalar(typename realvec_t::real_t const *restrict xptr, const real_t xjr = xptr[ij+ldm]; const real_t y = real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr); - yptr[ij] = y; + yptr[ij] = delay(y); } } } -// Assuming that xptr and yptr are aligned, but ldm can be arbitrary +// Assuming no particular alignment template<typename realvec_t> void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, typename realvec_t::real_t *restrict yptr, @@ -147,7 +155,7 @@ void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loada(xptr+ij); + const realvec_t x = realvec_t::loadu(xptr+ij); const realvec_t xil = realvec_t::loadu(xptr+ij, -1); const realvec_t xir = realvec_t::loadu(xptr+ij, +1); const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); @@ -155,15 +163,14 @@ void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, const realvec_t y = realvec_t(real_t(0.5)) * x + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - y.storea(yptr+ij, mask); + storeu(delay(y), yptr+ij, mask); } } } -// Assuming that xptr and yptr are aligned, and ldm is a multiple of -// the vector size +// Assuming that xptr and yptr are aligned, but ldm can be arbitrary template<typename realvec_t> void smooth_aligned(typename realvec_t::real_t const *restrict xptr, typename 
realvec_t::real_t *restrict yptr, @@ -171,6 +178,39 @@ void smooth_aligned(typename realvec_t::real_t const *restrict xptr, { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; + for (ptrdiff_t j=1; j<n-1; ++j) { + // Desired loop bounds + const ptrdiff_t imin = 1; + const ptrdiff_t imax = m-1; + // Align actual loop iterations with vector size + const ptrdiff_t ioff = ldm*j; + for (mask_t mask(imin, imax, ioff); mask; ++mask) { + const ptrdiff_t i = mask.index(); + const ptrdiff_t ij = ioff + i; + const realvec_t x = realvec_t::loada(xptr+ij); + const realvec_t xil = realvec_t::loadu(xptr+ij, -1); + const realvec_t xir = realvec_t::loadu(xptr+ij, +1); + const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); + const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm); + const realvec_t y = + realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storea(delay(y), yptr+ij, mask); + } + } +} + + + +// Assuming that xptr and yptr are aligned, and ldm is a multiple of +// the vector size +template<typename realvec_t> +void smooth_padded(typename realvec_t::real_t const *restrict xptr, + typename realvec_t::real_t *restrict yptr, + ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) +{ + typedef typename realvec_t::real_t real_t; + typedef typename realvec_t::mask_t mask_t; assert(ldm % realvec_t::size == 0); for (ptrdiff_t j=1; j<n-1; ++j) { // Desired loop bounds @@ -189,7 +229,7 @@ void smooth_aligned(typename realvec_t::real_t const *restrict xptr, const realvec_t y = realvec_t(real_t(0.5)) * x + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - y.storea(yptr+ij, mask); + storea(delay(y), yptr+ij, mask); } } } @@ -206,8 +246,8 @@ int main(int argc, char** argv) const int niters = 100; // Grid size - const ptrdiff_t m = 1000; - const ptrdiff_t n = 1000; + const ptrdiff_t m = 100; + const ptrdiff_t n = 100; // Choose a vector size #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4 @@ -256,5 +296,13 @@ int main(int argc, char** 
argv) cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); cout << "smooth_aligned: " << cycles << " cycles/point\n"; + t0 = getticks(); + for (int iter=0; iter<niters; ++iter) { + smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n); + } + t1 = getticks(); + cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cout << "smooth_padded: " << cycles << " cycles/point\n"; + return 0; } |