// -*-C++-*- #define VML_NODEBUG #include "vecmathlib.h" #include #include #include #include #include #include #include using namespace std; using namespace vecmathlib; #ifndef __has_builtin # define __has_builtin(x) 0 // Compatibility with non-clang compilers #endif typedef unsigned long long ticks; inline ticks getticks() { #if __has_builtin(__builtin_readcyclecounter) return __builtin_readcyclecounter(); #elif defined __x86_64__ ticks a, d; asm volatile("rdtsc" : "=a" (a), "=d" (d)); return a | (d << 32); #elif defined __powerpc__ unsigned int tbl, tbu, tbu1; do { asm volatile("mftbu %0": "=r"(tbu)); asm volatile("mftb %0": "=r"(tbl)); asm volatile("mftbu %0": "=r"(tbu1)); } while (tbu != tbu1); return ((unsigned long long)tbu << 32) | tbl; #else timeval tv; gettimeofday(&tv, NULL); return 1000000ULL * tv.tv_sec + tv.tv_usec; // timespec ts; // clock_gettime(CLOCK_REALTIME, &ts); // return 1000000000ULL * ts.tv_sec + ts.tv_nsec; #endif } inline double elapsed(ticks t1, ticks t0) { return t1-t0; } double get_sys_time() { timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + 1.0e-6 * tv.tv_usec; // timespec ts; // clock_gettime(CLOCK_REALTIME, &ts); // return ts.tv_sec + 1.0e-9 * ts.tv_nsec; } double measure_tick() { ticks const rstart = getticks(); double const wstart = get_sys_time(); while (get_sys_time() - wstart < 0.1) { // do nothing, just wait } ticks const rend = getticks(); double const wend = get_sys_time(); assert(wend-wstart >= 0.09); return (wend - wstart) / elapsed(rend, rstart); } double global_result = 0.0; template void save_result(realvec_t result) { for (int i=0; i inline T nop(T x) { return x; } template inline T fneg(T x) { return -x; } template inline T fadd(T x, T y) { return x+y; } template inline T fsub(T x, T y) { return x-y; } template inline T fmul(T x, T y) { return x*y; } template inline T fdiv(T x, T y) { return x/y; } template inline T frexp0(T x) { typename T::intvec_t ir; return frexp(x, &ir); } template inline typename T::intvec_t frexp1(T x) { typename T::intvec_t ir; frexp(x, &ir); return ir; } template inline T ldexps(T x, T y) { typename T::intvec_t iy = convert_int(y); return ldexp(x, iy[0]); } template inline T ldexpv(T x, T y) { typename T::intvec_t iy = convert_int(y); return ldexp(x, iy); } #define DECLARE_FUNCTOR(FUNC, XMIN, XMAX) \ template \ struct functor_##FUNC { \ static typename T::real_t get_xmin() { return XMIN; } \ static typename T::real_t get_xmax() { return XMAX; } \ static const char* name() { return #FUNC; } \ T operator()(T x) { \ return FUNC(x); \ } \ } #define DECLARE_BFUNCTOR(FUNC, XMIN, XMAX) \ template \ struct functor_##FUNC { \ static typename T::real_t get_xmin() { return XMIN; } \ static typename T::real_t get_xmax() { return XMAX; } \ static const char* name() { return #FUNC; } \ T operator()(T x) { \ typename T::boolvec_t res = FUNC(x); \ return convert_float(convert_int(res)); \ } \ } #define DECLARE_IFUNCTOR(FUNC, XMIN, XMAX) \ template \ struct functor_##FUNC { \ static typename T::real_t get_xmin() { return XMIN; } \ static typename T::real_t get_xmax() { return XMAX; } \ static const char* name() { return #FUNC; } \ T operator()(T x) { \ typename T::intvec_t res = FUNC(x); \ return convert_float(res); \ } \ } #define DECLARE_FUNCTOR2(FUNC, XMIN, XMAX, YOFFSET) \ template \ struct functor_##FUNC { \ static typename T::real_t get_xmin() { return XMIN; } \ static typename T::real_t get_xmax() { return XMAX; } \ static const char* name() { return #FUNC; } \ T operator()(T x) { \ const typename T::real_t yoffset = YOFFSET; \ return FUNC(x, x + T(yoffset)); \ } \ } #define DECLARE_FUNCTOR3(FUNC, XMIN, XMAX, YOFFSET, ZOFFSET) \ template \ struct functor_##FUNC { \ static typename T::real_t get_xmin() { return XMIN; } \ static typename T::real_t get_xmax() { return XMAX; } \ static const char* name() { return #FUNC; } \ T operator()(T x) { \ const typename T::real_t yoffset = YOFFSET; \ const typename T::real_t zoffset = ZOFFSET; \ return FUNC(x, x + T(yoffset), x + T(zoffset)); \ } \ } DECLARE_FUNCTOR(nop, 0.0, 1.0); DECLARE_FUNCTOR(fneg, 0.0, 1.0); DECLARE_FUNCTOR2(fadd, 0.0, 1.0, 2.0); DECLARE_FUNCTOR2(fsub, 0.0, 1.0, 2.0); DECLARE_FUNCTOR2(fmul, 0.0, 1.0, 2.0); DECLARE_FUNCTOR2(fdiv, 0.0, 1.0, 2.0); DECLARE_FUNCTOR(acos, -0.5, +0.5); DECLARE_FUNCTOR(acosh, 1.0, 2.0); DECLARE_FUNCTOR(asin, -0.5, +0.5); DECLARE_FUNCTOR(asinh, -1.0, +1.0); DECLARE_FUNCTOR(atan, -1.0, +1.0); DECLARE_FUNCTOR2(atan2, 0.0, 1.0, 2.0); DECLARE_FUNCTOR(atanh, -0.5, +0.5); DECLARE_FUNCTOR(cbrt, -1.0, 1.0); DECLARE_FUNCTOR(ceil, -1.0, +1.0); DECLARE_FUNCTOR2(copysign, -1.0, +1.0, 2.0); DECLARE_FUNCTOR(cos, 0.0, 1.0); DECLARE_FUNCTOR(cosh, 0.0, 1.0); DECLARE_FUNCTOR(exp, 0.0, 1.0); DECLARE_FUNCTOR(exp10, 0.0, 1.0); DECLARE_FUNCTOR(exp2, 0.0, 1.0); DECLARE_FUNCTOR(expm1, 0.0, 1.0); DECLARE_FUNCTOR(fabs, -1.0, 1.0); DECLARE_FUNCTOR(floor, -1.0, +1.0); DECLARE_FUNCTOR2(fdim, 0.0, 1.0, 2.0); DECLARE_FUNCTOR3(fma, 0.0, 1.0, 2.0, 3.0); DECLARE_FUNCTOR2(fmax, 0.0, 1.0, 2.0); DECLARE_FUNCTOR2(fmin, 0.0, 1.0, 2.0); DECLARE_FUNCTOR2(fmod, 0.0, 1.0, 2.0); DECLARE_FUNCTOR(frexp0, 1.0, 100.0); DECLARE_IFUNCTOR(frexp1, 1.0, 100.0); DECLARE_FUNCTOR2(hypot, 0.0, 1.0, 2.0); DECLARE_IFUNCTOR(ilogb, 1.0, 100.0); DECLARE_BFUNCTOR(isfinite, 0.0, 1.0); DECLARE_BFUNCTOR(isinf, 0.0, 1.0); DECLARE_BFUNCTOR(isnan, 0.0, 1.0); DECLARE_BFUNCTOR(isnormal, 0.0, 1.0); DECLARE_FUNCTOR2(ldexps, 1.0, 20.0, -10.0); DECLARE_FUNCTOR2(ldexpv, 1.0, 20.0, -10.0); DECLARE_FUNCTOR(log, 1.0, 2.0); DECLARE_FUNCTOR(log10, 1.0, 2.0); DECLARE_FUNCTOR(log1p, 0.0, 1.0); DECLARE_FUNCTOR(log2, 1.0, 2.0); DECLARE_FUNCTOR2(nextafter, -1.0, +1.0, 0.0); DECLARE_FUNCTOR2(pow, 0.0, 1.0, 2.0); DECLARE_FUNCTOR(rcp, 1.0, 2.0); DECLARE_FUNCTOR2(remainder, 0.0, 1.0, 2.0); DECLARE_FUNCTOR(rint, -1.0, +1.0); DECLARE_FUNCTOR(round, -1.0, +1.0); DECLARE_FUNCTOR(rsqrt, 1.0, 2.0); DECLARE_BFUNCTOR(signbit, -1.0, +1.0); DECLARE_FUNCTOR(sin, 0.0, 1.0); DECLARE_FUNCTOR(sinh, -1.0, +1.0); DECLARE_FUNCTOR(sqrt, 0.0, 1.0); DECLARE_FUNCTOR(tan, 0.0, 1.0); DECLARE_FUNCTOR(tanh, -1.0, +1.0); DECLARE_FUNCTOR(trunc, -1.0, +1.0); template class func_t> double run_bench() { const int numiters = 1000000; typedef typename realvec_t::real_t real_t; const real_t xmin = func_t::get_xmin(); const real_t xmax = func_t::get_xmax(); realvec_t x0, dx; for (int i=0; i func; t0 = getticks(); x = y = x0; for (int n=0; n class func_t> void bench_type_func() { cout << " " << setw(-5) << func_t::name() << " " << setw(18) << realvec_t::name() << ": " << flush; double const cycles = run_bench(); cout << cycles << " cycles\n" << flush; } template class func_t> void bench_func() { cout << "\n" << "Benchmarking " << func_t().name() << ":\n"; // Note: We benchmark neither testvec (since this is known to be // slow), nor builtinvec (since this has about the same performance // as pseudovec, and is also not very efficient). bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif bench_type_func, func_t>(); #ifdef VECMATHLIB_HAVE_VEC_FLOAT_1 bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_2 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_4 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_FLOAT_8 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif bench_type_func, func_t>(); #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_1 bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_2 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_4 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif #ifdef VECMATHLIB_HAVE_VEC_DOUBLE_8 bench_type_func, func_t>(); #ifdef __clang__ bench_type_func, func_t>(); #endif // bench_type_func, func_t>(); bench_type_func, func_t>(); #endif } void bench() { bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); bench_func(); } int main(int argc, char** argv) { cout << "Benchmarking math functions:\n"; bench(); return 0; }