diff options
Diffstat (limited to 'lib/tsan/benchmarks')
-rw-r--r-- | lib/tsan/benchmarks/mini_bench_local.cc | 49 | ||||
-rw-r--r-- | lib/tsan/benchmarks/mini_bench_shared.cc | 51 | ||||
-rw-r--r-- | lib/tsan/benchmarks/start_many_threads.cc | 52 | ||||
-rw-r--r-- | lib/tsan/benchmarks/vts_many_threads_bench.cc | 120 |
4 files changed, 272 insertions, 0 deletions
diff --git a/lib/tsan/benchmarks/mini_bench_local.cc b/lib/tsan/benchmarks/mini_bench_local.cc
new file mode 100644
index 0000000..accdcb6
--- /dev/null
+++ b/lib/tsan/benchmarks/mini_bench_local.cc
@@ -0,0 +1,49 @@
+// Mini-benchmark for tsan: non-shared memory writes.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;                    // Ints written per thread per pass (argv[2]).
+int *a;                     // n_threads * len ints, one slice per thread.
+const int kNumIter = 1000;  // Passes each thread makes over its slice.
+
+__attribute__((noinline))   // Keep the write loop out of line.
+void Run(int idx) {
+  for (int i = 0, n = len; i < n; i++)
+    a[i + idx * n] = i;     // Slice idx is a[idx * n .. idx * n + n).
+}
+
+void *Thread(void *arg) {
+  long idx = (long)arg;     // Thread index; main() passes (void*)(long)i.
+  printf("Thread %ld started\n", idx);
+  for (int i = 0; i < kNumIter; i++)
+    Run(idx);
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int n_threads = 0;
+  if (argc != 3) {          // Any other argument count selects the defaults.
+    n_threads = 4;
+    len = 1000000;
+  } else {
+    n_threads = atoi(argv[1]);
+    len = atoi(argv[2]);
+    assert(n_threads > 0 && n_threads <= 32 && len >= 0);
+  }
+  printf("%s: n_threads=%d len=%d iter=%d\n",
+         __FILE__, n_threads, len, kNumIter);
+  a = new int[n_threads * len];
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {  // A failed create is fatal.
+    if (pthread_create(&t[i], 0, Thread, (void*)(long)i)) abort();
+  }
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  delete [] t;
+  delete [] a;
+  return 0;
+}
diff --git a/lib/tsan/benchmarks/mini_bench_shared.cc b/lib/tsan/benchmarks/mini_bench_shared.cc
new file mode 100644
index 0000000..f9b9f42
--- /dev/null
+++ b/lib/tsan/benchmarks/mini_bench_shared.cc
@@ -0,0 +1,51 @@
+// Mini-benchmark for tsan: shared memory reads.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;                    // Number of ints in the shared array (argv[2]).
+int *a;                     // Shared array; main() sets a[i] = i before start.
+const int kNumIter = 1000;  // Passes each thread makes over the array.
+
+__attribute__((noinline))   // Keep the read loop out of line.
+void Run(int idx) {         // idx is unused: every thread scans all of a[].
+  for (int i = 0, n = len; i < n; i++)
+    if (a[i] != i) abort();
+}
+
+void *Thread(void *arg) {
+  long idx = (long)arg;     // Thread index; main() passes (void*)(long)i.
+  printf("Thread %ld started\n", idx);
+  for (int i = 0; i < kNumIter; i++)
+    Run(idx);
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int n_threads = 0;
+  if (argc != 3) {          // Any other argument count selects the defaults.
+    n_threads = 4;
+    len = 1000000;
+  } else {
+    n_threads = atoi(argv[1]);
+    len = atoi(argv[2]);
+    assert(n_threads > 0 && n_threads <= 32 && len >= 0);
+  }
+  printf("%s: n_threads=%d len=%d iter=%d\n",
+         __FILE__, n_threads, len, kNumIter);
+  a = new int[len];
+  for (int i = 0, n = len; i < n; i++)
+    a[i] = i;
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {  // A failed create is fatal.
+    if (pthread_create(&t[i], 0, Thread, (void*)(long)i)) abort();
+  }
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  delete [] t;
+  delete [] a;
+  return 0;
+}
diff --git a/lib/tsan/benchmarks/start_many_threads.cc b/lib/tsan/benchmarks/start_many_threads.cc
new file mode 100644
index 0000000..1e86fa6
--- /dev/null
+++ b/lib/tsan/benchmarks/start_many_threads.cc
@@ -0,0 +1,52 @@
+// Mini-benchmark for creating a lot of threads.
+//
+// Some facts:
+// a) clang -O1 takes <15ms to start N=500 threads,
+//    consuming ~4MB more RAM than N=1.
+// b) clang -O1 -ftsan takes ~26s to start N=500 threads,
+//    eats 5GB more RAM than N=1 (which is somewhat expected but still a lot)
+//    but then it consumes ~4GB of extra memory when the threads shut down!
+//    (definitely not in the barrier_wait interceptor)
+//    Also, it takes 26s to run with N=500 vs just 1.1s to run with N=1.
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+pthread_barrier_t all_threads_ready;  // Opens when all threads + main arrive.
+
+void* Thread(void *unused) {
+  pthread_barrier_wait(&all_threads_ready);
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int n_threads;
+  if (argc == 1) {
+    n_threads = 100;
+  } else if (argc == 2 && atoi(argv[1]) >= 0) {  // Negative counts -> usage.
+    n_threads = atoi(argv[1]);
+  } else {
+    printf("Usage: %s n_threads\n", argv[0]);
+    return 1;
+  }
+  printf("%s: n_threads=%d\n", __FILE__, n_threads);
+
+  pthread_barrier_init(&all_threads_ready, NULL, n_threads + 1);  // +1: main.
+
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {
+    int status = pthread_create(&t[i], 0, Thread, (void*)(long)i);
+    assert(status == 0);
+  }
+  // sleep(5);  // FIXME: simplify measuring the memory usage.
+  pthread_barrier_wait(&all_threads_ready);
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  // sleep(5);  // FIXME: simplify measuring the memory usage.
+  delete [] t;
+
+  return 0;
+}
diff --git a/lib/tsan/benchmarks/vts_many_threads_bench.cc b/lib/tsan/benchmarks/vts_many_threads_bench.cc
new file mode 100644
index 0000000..f1056e2
--- /dev/null
+++ b/lib/tsan/benchmarks/vts_many_threads_bench.cc
@@ -0,0 +1,120 @@
+// Mini-benchmark for tsan VTS worst case performance
+// Idea:
+// 1) Spawn M + N threads (M >> N)
+//    We'll call the 'M' threads as 'garbage threads'.
+// 2) Make sure all threads have been created, so that no TIDs were reused
+// 3) Join the garbage threads
+// 4) Do many sync operations on the remaining N threads
+//
+// It turns out that due to O(M+N) VTS complexity, step (4) is much slower
+// when M is large.
+//
+// Some numbers:
+// a) clang++ native O1 with n_iterations=200kk takes
+//      5s regardless of M
+//    clang++ tsanv2 O1 with n_iterations=20kk takes
+//      23.5s with M=200
+//      11.5s with M=1
+//    i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
+// b) g++ native O1 with n_iterations=200kk takes
+//      5.5s regardless of M
+//    g++ tsanv1 O1 with n_iterations=2kk takes
+//      39.5s with M=200
+//      20.5s with M=1
+//    i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+class __attribute__((aligned(64))) Mutex {  // 64-byte aligned pthread mutex.
+ public:
+  Mutex() { pthread_mutex_init(&m_, NULL); }
+  ~Mutex() { pthread_mutex_destroy(&m_); }
+  void Lock() { pthread_mutex_lock(&m_); }
+  void Unlock() { pthread_mutex_unlock(&m_); }
+
+ private:
+  pthread_mutex_t m_;
+};
+
+const int kNumMutexes = 1024;
+Mutex mutexes[kNumMutexes];
+
+int n_threads, n_iterations;  // N worker threads; lock/unlock pairs per worker.
+
+pthread_barrier_t all_threads_ready, main_threads_ready;
+
+void* GarbageThread(void *unused) {
+  pthread_barrier_wait(&all_threads_ready);  // Exit once every thread exists.
+  return 0;
+}
+
+void *Thread(void *arg) {
+  long idx = (long)arg;  // Worker index; main() passes it as (void*)(long)i.
+  pthread_barrier_wait(&all_threads_ready);
+
+  // Wait for the main thread to join the garbage threads.
+  pthread_barrier_wait(&main_threads_ready);
+
+  printf("Thread %ld go!\n", idx);
+  int offset = idx * kNumMutexes / n_threads;  // Per-worker starting mutex.
+  for (int i = 0; i < n_iterations; i++) {
+    mutexes[(offset + i) % kNumMutexes].Lock();
+    mutexes[(offset + i) % kNumMutexes].Unlock();
+  }
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int n_garbage_threads;
+  if (argc == 1) {
+    n_threads = 2;
+    n_garbage_threads = 200;
+    n_iterations = 20000000;
+  } else if (argc == 4) {
+    n_threads = atoi(argv[1]);
+    assert(n_threads > 0 && n_threads <= 32);
+    n_garbage_threads = atoi(argv[2]);
+    assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
+    n_iterations = atoi(argv[3]);
+  } else {
+    printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
+    return 1;
+  }
+  printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
+         __FILE__, n_threads, n_garbage_threads, n_iterations);
+
+  pthread_barrier_init(&all_threads_ready, NULL, n_garbage_threads +
+                                                 n_threads + 1);
+  pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1);
+
+  pthread_t *t = new pthread_t[n_threads];
+  {
+    pthread_t *g_t = new pthread_t[n_garbage_threads];
+    for (int i = 0; i < n_garbage_threads; i++) {
+      int status = pthread_create(&g_t[i], 0, GarbageThread, NULL);
+      assert(status == 0);
+    }
+    for (int i = 0; i < n_threads; i++) {
+      int status = pthread_create(&t[i], 0, Thread, (void*)(long)i);
+      assert(status == 0);
+    }
+    pthread_barrier_wait(&all_threads_ready);
+    printf("All threads started! Killing the garbage threads.\n");
+    for (int i = 0; i < n_garbage_threads; i++) {
+      pthread_join(g_t[i], 0);
+    }
+    delete [] g_t;
+  }
+  printf("Resuming the main threads.\n");
+  pthread_barrier_wait(&main_threads_ready);
+
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  delete [] t;
+  return 0;
+}