summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--loop.cc66
1 files changed, 57 insertions, 9 deletions
diff --git a/loop.cc b/loop.cc
index 33199a7..fe6955e 100644
--- a/loop.cc
+++ b/loop.cc
@@ -106,6 +106,14 @@ void init(typename realvec_t::real_t *restrict xptr,
// Evolution loop: Simple stencil example (Gaussian smoothing)
////////////////////////////////////////////////////////////////////////////////
+// Introduce a delay, so that cache access is not so important
+template<typename T>
+static T delay(const T x)
+{
+ return x;
+ // return log(exp(x));
+}
+
// Original version, unvectorized
template<typename realvec_t>
void smooth_scalar(typename realvec_t::real_t const *restrict xptr,
@@ -123,14 +131,14 @@ void smooth_scalar(typename realvec_t::real_t const *restrict xptr,
const real_t xjr = xptr[ij+ldm];
const real_t y =
real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr);
- yptr[ij] = y;
+ yptr[ij] = delay(y);
}
}
}
-// Assuming that xptr and yptr are aligned, but ldm can be arbitrary
+// Assuming no particular alignment
template<typename realvec_t>
void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
typename realvec_t::real_t *restrict yptr,
@@ -147,7 +155,7 @@ void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
for (mask_t mask(imin, imax, ioff); mask; ++mask) {
const ptrdiff_t i = mask.index();
const ptrdiff_t ij = ioff + i;
- const realvec_t x = realvec_t::loada(xptr+ij);
+ const realvec_t x = realvec_t::loadu(xptr+ij);
const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
@@ -155,15 +163,14 @@ void smooth_unaligned(typename realvec_t::real_t const *restrict xptr,
const realvec_t y =
realvec_t(real_t(0.5)) * x +
realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
- y.storea(yptr+ij, mask);
+ storeu(delay(y), yptr+ij, mask);
}
}
}
-// Assuming that xptr and yptr are aligned, and ldm is a multiple of
-// the vector size
+// Assuming that xptr and yptr are aligned, but ldm can be arbitrary
template<typename realvec_t>
void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
typename realvec_t::real_t *restrict yptr,
@@ -171,6 +178,39 @@ void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
{
typedef typename realvec_t::real_t real_t;
typedef typename realvec_t::mask_t mask_t;
+ for (ptrdiff_t j=1; j<n-1; ++j) {
+ // Desired loop bounds
+ const ptrdiff_t imin = 1;
+ const ptrdiff_t imax = m-1;
+ // Align actual loop iterations with vector size
+ const ptrdiff_t ioff = ldm*j;
+ for (mask_t mask(imin, imax, ioff); mask; ++mask) {
+ const ptrdiff_t i = mask.index();
+ const ptrdiff_t ij = ioff + i;
+ const realvec_t x = realvec_t::loada(xptr+ij);
+ const realvec_t xil = realvec_t::loadu(xptr+ij, -1);
+ const realvec_t xir = realvec_t::loadu(xptr+ij, +1);
+ const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm);
+ const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm);
+ const realvec_t y =
+ realvec_t(real_t(0.5)) * x +
+ realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
+ storea(delay(y), yptr+ij, mask);
+ }
+ }
+}
+
+
+
+// Assuming that xptr and yptr are aligned, and ldm is a multiple of
+// the vector size
+template<typename realvec_t>
+void smooth_padded(typename realvec_t::real_t const *restrict xptr,
+ typename realvec_t::real_t *restrict yptr,
+ ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n)
+{
+ typedef typename realvec_t::real_t real_t;
+ typedef typename realvec_t::mask_t mask_t;
assert(ldm % realvec_t::size == 0);
for (ptrdiff_t j=1; j<n-1; ++j) {
// Desired loop bounds
@@ -189,7 +229,7 @@ void smooth_aligned(typename realvec_t::real_t const *restrict xptr,
const realvec_t y =
realvec_t(real_t(0.5)) * x +
realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr);
- y.storea(yptr+ij, mask);
+ storea(delay(y), yptr+ij, mask);
}
}
}
@@ -206,8 +246,8 @@ int main(int argc, char** argv)
const int niters = 100;
// Grid size
- const ptrdiff_t m = 1000;
- const ptrdiff_t n = 1000;
+ const ptrdiff_t m = 100;
+ const ptrdiff_t n = 100;
// Choose a vector size
#if defined VECMATHLIB_HAVE_VEC_DOUBLE_4
@@ -256,5 +296,13 @@ int main(int argc, char** argv)
cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
cout << "smooth_aligned: " << cycles << " cycles/point\n";
+ t0 = getticks();
+ for (int iter=0; iter<niters; ++iter) {
+ smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n);
+ }
+ t1 = getticks();
+ cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters);
+ cout << "smooth_padded: " << cycles << " cycles/point\n";
+
return 0;
}
OpenPOWER on IntegriCloud