// -*-C++-*- #define restrict __restrict__ #include "vecmathlib.h" #include #include #include #include #include using namespace std; using namespace vecmathlib; //////////////////////////////////////////////////////////////////////////////// // Helpers //////////////////////////////////////////////////////////////////////////////// #ifndef __has_builtin # define __has_builtin(x) 0 // Compatibility with non-clang compilers #endif // align upwards static size_t align_up(size_t i, size_t size) { return (i + size - 1) / size * size; } //////////////////////////////////////////////////////////////////////////////// // High-resolution timer //////////////////////////////////////////////////////////////////////////////// typedef unsigned long long ticks; inline ticks getticks() { #if __has_builtin(__builtin_readcyclecounter) return __builtin_readcyclecounter(); #elif defined __x86_64__ ticks a, d; asm volatile("rdtsc" : "=a" (a), "=d" (d)); return a | (d << 32); #elif defined __powerpc__ unsigned int tbl, tbu, tbu1; do { asm volatile("mftbu %0": "=r"(tbu)); asm volatile("mftb %0": "=r"(tbl)); asm volatile("mftbu %0": "=r"(tbu1)); } while (tbu != tbu1); return ((unsigned long long)tbu << 32) | tbl; #else timeval tv; gettimeofday(&tv, NULL); return 1000000ULL * tv.tv_sec + tv.tv_usec; // timespec ts; // clock_gettime(CLOCK_REALTIME, &ts); // return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #endif } inline double elapsed(ticks t1, ticks t0) { return t1-t0; } double get_sys_time() { timeval tp; gettimeofday(&tp, NULL); return tp.tv_sec + 1.0e-6 * tp.tv_usec; } double measure_tick() { ticks const rstart = getticks(); double const wstart = get_sys_time(); while (get_sys_time() - wstart < 0.1) { // do nothing, just wait } ticks const rend = getticks(); double const wend = get_sys_time(); assert(wend-wstart >= 0.09); return (wend - wstart) / elapsed(rend, rstart); } //////////////////////////////////////////////////////////////////////////////// // Initialize the grid //////////////////////////////////////////////////////////////////////////////// template void init(typename realvec_t::real_t *restrict xptr, ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) { for (ptrdiff_t j=0; j static T delay(const T x) { return x; // return log(exp(x)); } // Original version, unvectorized template void smooth_scalar(typename realvec_t::real_t const *restrict xptr, typename realvec_t::real_t *restrict yptr, ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; for (ptrdiff_t j=1; j void smooth_unaligned(typename realvec_t::real_t const *restrict xptr, typename realvec_t::real_t *restrict yptr, ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; for (ptrdiff_t j=1; j void smooth_aligned(typename realvec_t::real_t const *restrict xptr, typename realvec_t::real_t *restrict yptr, ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; for (ptrdiff_t j=1; j void smooth_padded(typename realvec_t::real_t const *restrict xptr, typename realvec_t::real_t *restrict yptr, ptrdiff_t m, ptrdiff_t ldm, ptrdiff_t n) { typedef typename realvec_t::real_t real_t; typedef typename realvec_t::mask_t mask_t; assert(ldm % realvec_t::size == 0); for (ptrdiff_t j=1; j realvec_t; #elif defined VECMATHLIB_HAVE_VEC_DOUBLE_2 typedef realvec realvec_t; #else typedef realpseudovec realvec_t; #endif // Ensure the grid size is aligned const ptrdiff_t ldm = align_up(m, realvec_t::size); typedef realvec_t::real_t real_t; vector x0(ldm*n + realvec_t::size-1), y0(ldm*n + realvec_t::size-1); real_t* restrict const x = (real_t*)align_up(intptr_t(&x0[0]), sizeof(realvec_t)); real_t* restrict const y = (real_t*)align_up(intptr_t(&y0[0]), sizeof(realvec_t)); for (ptrdiff_t i=0; i(&x[0], m, ldm, n); // Timers ticks t0, t1; double const cycles_per_tick = 1.0; // measure_tick(); double cycles; // Run the different evolution loop versions t0 = getticks(); for (int iter=0; iter(&x[0], &y[0], m, ldm, n); } t1 = getticks(); cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); cout << "smooth_scalar: " << cycles << " cycles/point\n"; t0 = getticks(); for (int iter=0; iter(&x[0], &y[0], m, ldm, n); } t1 = getticks(); cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); cout << "smooth_unaligned: " << cycles << " cycles/point\n"; t0 = getticks(); for (int iter=0; iter(&x[0], &y[0], m, ldm, n); } t1 = getticks(); cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); cout << "smooth_aligned: " << cycles << " cycles/point\n"; t0 = getticks(); for (int iter=0; iter(&x[0], &y[0], m, ldm, n); } t1 = getticks(); cycles = cycles_per_tick * elapsed(t1,t0) / (1.0 * (n-1) * (m-1) * niters); cout << "smooth_padded: " << cycles << " cycles/point\n"; return 0; }