diff options
author | Erik Schnetter <schnetter@gmail.com> | 2015-10-16 13:45:31 -0400 |
---|---|---|
committer | Erik Schnetter <schnetter@gmail.com> | 2015-10-16 13:45:31 -0400 |
commit | 06ddf9a62fd6e991dfcf0bd2abe0941b960cb2bb (patch) | |
tree | 793b48ebee5b9c779c8afd34cc20245c93b9ad9d /loop.cc | |
parent | 71a4a6821276690eed15df8d2c246be499da1e29 (diff) | |
download | vecmathlib-06ddf9a62fd6e991dfcf0bd2abe0941b960cb2bb.zip vecmathlib-06ddf9a62fd6e991dfcf0bd2abe0941b960cb2bb.tar.gz |
Reformat source code with clang-format
Note: If you have an existing checkout with modifications, then you should:
(1) save your current state on a branch
(2) run clang-format on this branch
(3) compare this branch to a fresh checkout of the clang-formatted master
Since both your branch and master have been formatted with clang-format, this will lead to a very small diff, avoiding spurious changes due to formatting differences.
Diffstat (limited to 'loop.cc')
-rw-r--r-- | loop.cc | 250 |
1 files changed, 111 insertions, 139 deletions
@@ -14,68 +14,57 @@ using namespace std; using namespace vecmathlib; - - //////////////////////////////////////////////////////////////////////////////// // Helpers //////////////////////////////////////////////////////////////////////////////// #ifndef __has_builtin -# define __has_builtin(x) 0 // Compatibility with non-clang compilers +#define __has_builtin(x) 0 // Compatibility with non-clang compilers #endif // align upwards -static size_t align_up(size_t i, size_t size) -{ +static size_t align_up(size_t i, size_t size) { return (i + size - 1) / size * size; } - - //////////////////////////////////////////////////////////////////////////////// // High-resolution timer //////////////////////////////////////////////////////////////////////////////// typedef unsigned long long ticks; -inline ticks getticks() -{ +inline ticks getticks() { #if __has_builtin(__builtin_readcyclecounter) return __builtin_readcyclecounter(); #elif defined __x86_64__ ticks a, d; - asm volatile("rdtsc" : "=a" (a), "=d" (d)); + asm volatile("rdtsc" : "=a"(a), "=d"(d)); return a | (d << 32); #elif defined __powerpc__ unsigned int tbl, tbu, tbu1; do { - asm volatile("mftbu %0": "=r"(tbu)); - asm volatile("mftb %0": "=r"(tbl)); - asm volatile("mftbu %0": "=r"(tbu1)); + asm volatile("mftbu %0" : "=r"(tbu)); + asm volatile("mftb %0" : "=r"(tbl)); + asm volatile("mftbu %0" : "=r"(tbu1)); } while (tbu != tbu1); return ((unsigned long long)tbu << 32) | tbl; #else timeval tv; gettimeofday(&tv, NULL); return 1000000ULL * tv.tv_sec + tv.tv_usec; - // timespec ts; - // clock_gettime(CLOCK_REALTIME, &ts); - // return 1000000000ULL * ts.tv_sec + ts.tv_nsec; +// timespec ts; +// clock_gettime(CLOCK_REALTIME, &ts); +// return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #endif } -inline double elapsed(ticks t1, ticks t0) -{ - return t1-t0; -} +inline double elapsed(ticks t1, ticks t0) { return t1 - t0; } -double get_sys_time() -{ +double get_sys_time() { timeval tp; gettimeofday(&tp, NULL); return tp.tv_sec + 1.0e-6 * tp.tv_usec; } -double measure_tick() -{ +double measure_tick() { ticks const rstart = getticks(); double const wstart = get_sys_time(); while (get_sys_time() - wstart < 0.1) { @@ -83,236 +72,219 @@ double measure_tick() } ticks const rend = getticks(); double const wend = get_sys_time(); - assert(wend-wstart >= 0.09); + assert(wend - wstart >= 0.09); return (wend - wstart) / elapsed(rend, rstart); } - - //////////////////////////////////////////////////////////////////////////////// // Initialize the grid //////////////////////////////////////////////////////////////////////////////// -template<typename realvec_t> -void init(typename realvec_t::real_t *restrict xptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ - for (ptrdiff_t j=0; j<n; ++j) { - for (ptrdiff_t i=0; i<m; ++i) { - const ptrdiff_t ij = ldm*j + i; - xptr[ij] = (i+j)%2; +template <typename realvec_t> +void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm, + ptrdiff_t n) { + for (ptrdiff_t j = 0; j < n; ++j) { + for (ptrdiff_t i = 0; i < m; ++i) { + const ptrdiff_t ij = ldm * j + i; + xptr[ij] = (i + j) % 2; } } } - - //////////////////////////////////////////////////////////////////////////////// // Evolution loop: Simple stencil example (Gaussian smoothing) //////////////////////////////////////////////////////////////////////////////// // Introduce a delay, so that cache access is not so important -template<typename T> -static T delay(const T x) -{ +template <typename T> static T delay(const T x) { return x; // return log(exp(x)); } // Original version, unvectorized -template<typename realvec_t> +template <typename realvec_t> void smooth_scalar(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; - for (ptrdiff_t j=1; j<n-1; ++j) { - for (ptrdiff_t i=1; i<m-1; ++i) { - const ptrdiff_t ij = ldm*j + i; - const real_t x = xptr[ij]; - const real_t xil = xptr[ij-1]; - const real_t xir = xptr[ij+1]; - const real_t xjl = xptr[ij-ldm]; - const real_t xjr = xptr[ij+ldm]; + for (ptrdiff_t j = 1; j < n - 1; ++j) { + for (ptrdiff_t i = 1; i < m - 1; ++i) { + const ptrdiff_t ij = ldm * j + i; + const real_t x = xptr[ij]; + const real_t xil = xptr[ij - 1]; + const real_t xir = xptr[ij + 1]; + const real_t xjl = xptr[ij - ldm]; + const real_t xjr = xptr[ij + ldm]; const real_t y = - real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr); + real_t(0.5) * x + real_t(0.125) * (xil + xir + xjl + xjr); yptr[ij] = delay(y); } } } - - // Assuming no particular alignment -template<typename realvec_t> +template <typename realvec_t> void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loadu(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storeu(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loadu(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storeu(delay(y), yptr + ij, mask); } } } - - // Assuming that xptr and yptr are aligned, but ldm can be arbitrary -template<typename realvec_t> +template <typename realvec_t> void smooth_aligned(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loada(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loadu(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loadu(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storea(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loada(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loadu(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loadu(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storea(delay(y), yptr + ij, mask); } } } - - // Assuming that xptr and yptr are aligned, and ldm is a multiple of // the vector size -template<typename realvec_t> +template <typename realvec_t> void smooth_padded(typename realvec_t::real_t const *restrict xptr, - typename realvec_t::real_t *restrict yptr, - ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) -{ + typename realvec_t::real_t *restrict yptr, ptrdiff_t m, + ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; assert(ldm % realvec_t::size == 0); - for (ptrdiff_t j=1; j<n-1; ++j) { + for (ptrdiff_t j = 1; j < n - 1; ++j) { // Desired loop bounds const ptrdiff_t imin = 1; - const ptrdiff_t imax = m-1; + const ptrdiff_t imax = m - 1; // Align actual loop iterations with vector size - const ptrdiff_t ioff = ldm*j; + const ptrdiff_t ioff = ldm * j; for (mask_t mask(imin, imax, ioff); mask; ++mask) { const ptrdiff_t i = mask.index(); const ptrdiff_t ij = ioff + i; - const realvec_t x = realvec_t::loada(xptr+ij); - const realvec_t xil = realvec_t::loadu(xptr+ij, -1); - const realvec_t xir = realvec_t::loadu(xptr+ij, +1); - const realvec_t xjl = realvec_t::loada(xptr+ij-ldm); - const realvec_t xjr = realvec_t::loada(xptr+ij+ldm); - const realvec_t y = - realvec_t(real_t(0.5)) * x + - realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); - storea(delay(y), yptr+ij, mask); + const realvec_t x = realvec_t::loada(xptr + ij); + const realvec_t xil = realvec_t::loadu(xptr + ij, -1); + const realvec_t xir = realvec_t::loadu(xptr + ij, +1); + const realvec_t xjl = realvec_t::loada(xptr + ij - ldm); + const realvec_t xjr = realvec_t::loada(xptr + ij + ldm); + const realvec_t y = realvec_t(real_t(0.5)) * x + + realvec_t(real_t(0.125)) * (xil + xir + xjl + xjr); + storea(delay(y), yptr + ij, mask); } } } - - //////////////////////////////////////////////////////////////////////////////// // Main routine //////////////////////////////////////////////////////////////////////////////// -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { // Number of iterations const int niters = 100; - + // Grid size const ptrdiff_t m = 100; const ptrdiff_t n = 100; - - // Choose a vector size + +// Choose a vector size #if defined VECMATHLIB_HAVE_VEC_DOUBLE_4 - typedef realvec<double,4> realvec_t; + typedef realvec<double, 4> realvec_t; #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 - typedef realvec<double,2> realvec_t; + typedef realvec<double, 2> realvec_t; #else - typedef realpseudovec<double,1> realvec_t; + typedef realpseudovec<double, 1> realvec_t; #endif - + // Ensure the grid size is aligned const ptrdiff_t ldm = align_up(m, realvec_t::size); typedef realvec_t::real_t real_t; - vector<real_t> x0(ldm*n + realvec_t::size-1), y0(ldm*n + realvec_t::size-1); - real_t* restrict const x = - (real_t*)align_up(intptr_t(&x0[0]), sizeof(realvec_t)); - real_t* restrict const y = - (real_t*)align_up(intptr_t(&y0[0]), sizeof(realvec_t)); - for (ptrdiff_t i=0; i<ldm*n; ++i) y[i] = 0.0; - + vector<real_t> x0(ldm * n + realvec_t::size - 1), + y0(ldm * n + realvec_t::size - 1); + real_t *restrict const x = + (real_t *)align_up(intptr_t(&x0[0]), sizeof(realvec_t)); + real_t *restrict const y = + (real_t *)align_up(intptr_t(&y0[0]), sizeof(realvec_t)); + for (ptrdiff_t i = 0; i < ldm * n; ++i) + y[i] = 0.0; + // Initialize init<realvec_t>(&x[0], m, ldm, n); - + // Timers ticks t0, t1; double const cycles_per_tick = 1.0; // measure_tick(); double cycles; - + // Run the different evolution loop versions t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_scalar<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_scalar: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_unaligned<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_unaligned: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_aligned<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_aligned: " << cycles << " cycles/point\n"; - + t0 = getticks(); - for (int iter=0; iter<niters; ++iter) { + for (int iter = 0; iter < niters; ++iter) { smooth_padded<realvec_t>(&x[0], &y[0], m, ldm, n); } t1 = getticks(); - cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); + cycles = + cycles_per_tick * elapsed(t1, t0) / (1.0 * (n - 1) * (m - 1) * niters); cout << "smooth_padded: " << cycles << " cycles/point\n"; - + return 0; } |