summaryrefslogtreecommitdiffstats
path: root/lib/tsan/benchmarks
diff options
context:
space:
mode:
Diffstat (limited to 'lib/tsan/benchmarks')
-rw-r--r--lib/tsan/benchmarks/mini_bench_local.cc49
-rw-r--r--lib/tsan/benchmarks/mini_bench_shared.cc51
-rw-r--r--lib/tsan/benchmarks/start_many_threads.cc52
-rw-r--r--lib/tsan/benchmarks/vts_many_threads_bench.cc120
4 files changed, 272 insertions, 0 deletions
diff --git a/lib/tsan/benchmarks/mini_bench_local.cc b/lib/tsan/benchmarks/mini_bench_local.cc
new file mode 100644
index 0000000..accdcb6
--- /dev/null
+++ b/lib/tsan/benchmarks/mini_bench_local.cc
@@ -0,0 +1,49 @@
+// Mini-benchmark for tsan: non-shared memory writes.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;
+int *a;
+const int kNumIter = 1000;
+
+__attribute__((noinline))
+void Run(int idx) {
+ for (int i = 0, n = len; i < n; i++)
+ a[i + idx * n] = i;
+}
+
+void *Thread(void *arg) {
+ long idx = (long)arg;
+ printf("Thread %ld started\n", idx);
+ for (int i = 0; i < kNumIter; i++)
+ Run(idx);
+ printf("Thread %ld done\n", idx);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int n_threads = 0;
+ if (argc != 3) {
+ n_threads = 4;
+ len = 1000000;
+ } else {
+ n_threads = atoi(argv[1]);
+ assert(n_threads > 0 && n_threads <= 32);
+ len = atoi(argv[2]);
+ }
+ printf("%s: n_threads=%d len=%d iter=%d\n",
+ __FILE__, n_threads, len, kNumIter);
+ a = new int[n_threads * len];
+ pthread_t *t = new pthread_t[n_threads];
+ for (int i = 0; i < n_threads; i++) {
+ pthread_create(&t[i], 0, Thread, (void*)i);
+ }
+ for (int i = 0; i < n_threads; i++) {
+ pthread_join(t[i], 0);
+ }
+ delete [] t;
+ delete [] a;
+ return 0;
+}
diff --git a/lib/tsan/benchmarks/mini_bench_shared.cc b/lib/tsan/benchmarks/mini_bench_shared.cc
new file mode 100644
index 0000000..f9b9f42
--- /dev/null
+++ b/lib/tsan/benchmarks/mini_bench_shared.cc
@@ -0,0 +1,51 @@
+// Mini-benchmark for tsan: shared memory reads.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;
+int *a;
+const int kNumIter = 1000;
+
+__attribute__((noinline))
+void Run(int idx) {
+ for (int i = 0, n = len; i < n; i++)
+ if (a[i] != i) abort();
+}
+
+void *Thread(void *arg) {
+ long idx = (long)arg;
+ printf("Thread %ld started\n", idx);
+ for (int i = 0; i < kNumIter; i++)
+ Run(idx);
+ printf("Thread %ld done\n", idx);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int n_threads = 0;
+ if (argc != 3) {
+ n_threads = 4;
+ len = 1000000;
+ } else {
+ n_threads = atoi(argv[1]);
+ assert(n_threads > 0 && n_threads <= 32);
+ len = atoi(argv[2]);
+ }
+ printf("%s: n_threads=%d len=%d iter=%d\n",
+ __FILE__, n_threads, len, kNumIter);
+ a = new int[len];
+ for (int i = 0, n = len; i < n; i++)
+ a[i] = i;
+ pthread_t *t = new pthread_t[n_threads];
+ for (int i = 0; i < n_threads; i++) {
+ pthread_create(&t[i], 0, Thread, (void*)i);
+ }
+ for (int i = 0; i < n_threads; i++) {
+ pthread_join(t[i], 0);
+ }
+ delete [] t;
+ delete [] a;
+ return 0;
+}
diff --git a/lib/tsan/benchmarks/start_many_threads.cc b/lib/tsan/benchmarks/start_many_threads.cc
new file mode 100644
index 0000000..1e86fa6
--- /dev/null
+++ b/lib/tsan/benchmarks/start_many_threads.cc
@@ -0,0 +1,52 @@
+// Mini-benchmark for creating a lot of threads.
+//
+// Some facts:
+// a) clang -O1 takes <15ms to start N=500 threads,
+// consuming ~4MB more RAM than N=1.
+// b) clang -O1 -ftsan takes ~26s to start N=500 threads,
+// eats 5GB more RAM than N=1 (which is somewhat expected but still a lot)
+// but then it consumes ~4GB of extra memory when the threads shut down!
+// (definitely not in the barrier_wait interceptor)
+// Also, it takes 26s to run with N=500 vs just 1.1s to run with N=1.
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+pthread_barrier_t all_threads_ready;
+
+void* Thread(void *unused) {
+ pthread_barrier_wait(&all_threads_ready);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int n_threads;
+ if (argc == 1) {
+ n_threads = 100;
+ } else if (argc == 2) {
+ n_threads = atoi(argv[1]);
+ } else {
+ printf("Usage: %s n_threads\n", argv[0]);
+ return 1;
+ }
+ printf("%s: n_threads=%d\n", __FILE__, n_threads);
+
+ pthread_barrier_init(&all_threads_ready, NULL, n_threads + 1);
+
+ pthread_t *t = new pthread_t[n_threads];
+ for (int i = 0; i < n_threads; i++) {
+ int status = pthread_create(&t[i], 0, Thread, (void*)i);
+ assert(status == 0);
+ }
+ // sleep(5); // FIXME: simplify measuring the memory usage.
+ pthread_barrier_wait(&all_threads_ready);
+ for (int i = 0; i < n_threads; i++) {
+ pthread_join(t[i], 0);
+ }
+ // sleep(5); // FIXME: simplify measuring the memory usage.
+ delete [] t;
+
+ return 0;
+}
diff --git a/lib/tsan/benchmarks/vts_many_threads_bench.cc b/lib/tsan/benchmarks/vts_many_threads_bench.cc
new file mode 100644
index 0000000..f1056e2
--- /dev/null
+++ b/lib/tsan/benchmarks/vts_many_threads_bench.cc
@@ -0,0 +1,120 @@
+// Mini-benchmark for tsan VTS worst case performance
+// Idea:
+// 1) Spawn M + N threads (M >> N)
+// We'll call the 'M' threads as 'garbage threads'.
+// 2) Make sure all threads have created thus no TIDs were reused
+// 3) Join the garbage threads
+// 4) Do many sync operations on the remaining N threads
+//
+// It turns out that due to O(M+N) VTS complexity the (4) is much slower with
+// when N is large.
+//
+// Some numbers:
+// a) clang++ native O1 with n_iterations=200kk takes
+// 5s regardless of M
+// clang++ tsanv2 O1 with n_iterations=20kk takes
+// 23.5s with M=200
+// 11.5s with M=1
+// i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
+// b) g++ native O1 with n_iterations=200kk takes
+// 5.5s regardless of M
+// g++ tsanv1 O1 with n_iterations=2kk takes
+// 39.5s with M=200
+// 20.5s with M=1
+// i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+class __attribute__((aligned(64))) Mutex {
+ public:
+ Mutex() { pthread_mutex_init(&m_, NULL); }
+ ~Mutex() { pthread_mutex_destroy(&m_); }
+ void Lock() { pthread_mutex_lock(&m_); }
+ void Unlock() { pthread_mutex_unlock(&m_); }
+
+ private:
+ pthread_mutex_t m_;
+};
+
+const int kNumMutexes = 1024;
+Mutex mutexes[kNumMutexes];
+
+int n_threads, n_iterations;
+
+pthread_barrier_t all_threads_ready, main_threads_ready;
+
+void* GarbageThread(void *unused) {
+ pthread_barrier_wait(&all_threads_ready);
+ return 0;
+}
+
+void *Thread(void *arg) {
+ long idx = (long)arg;
+ pthread_barrier_wait(&all_threads_ready);
+
+ // Wait for the main thread to join the garbage threads.
+ pthread_barrier_wait(&main_threads_ready);
+
+ printf("Thread %ld go!\n", idx);
+ int offset = idx * kNumMutexes / n_threads;
+ for (int i = 0; i < n_iterations; i++) {
+ mutexes[(offset + i) % kNumMutexes].Lock();
+ mutexes[(offset + i) % kNumMutexes].Unlock();
+ }
+ printf("Thread %ld done\n", idx);
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ int n_garbage_threads;
+ if (argc == 1) {
+ n_threads = 2;
+ n_garbage_threads = 200;
+ n_iterations = 20000000;
+ } else if (argc == 4) {
+ n_threads = atoi(argv[1]);
+ assert(n_threads > 0 && n_threads <= 32);
+ n_garbage_threads = atoi(argv[2]);
+ assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
+ n_iterations = atoi(argv[3]);
+ } else {
+ printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
+ return 1;
+ }
+ printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
+ __FILE__, n_threads, n_garbage_threads, n_iterations);
+
+ pthread_barrier_init(&all_threads_ready, NULL, n_garbage_threads + n_threads + 1);
+ pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1);
+
+ pthread_t *t = new pthread_t[n_threads];
+ {
+ pthread_t *g_t = new pthread_t[n_garbage_threads];
+ for (int i = 0; i < n_garbage_threads; i++) {
+ int status = pthread_create(&g_t[i], 0, GarbageThread, NULL);
+ assert(status == 0);
+ }
+ for (int i = 0; i < n_threads; i++) {
+ int status = pthread_create(&t[i], 0, Thread, (void*)i);
+ assert(status == 0);
+ }
+ pthread_barrier_wait(&all_threads_ready);
+ printf("All threads started! Killing the garbage threads.\n");
+ for (int i = 0; i < n_garbage_threads; i++) {
+ pthread_join(g_t[i], 0);
+ }
+ delete [] g_t;
+ }
+ printf("Resuming the main threads.\n");
+ pthread_barrier_wait(&main_threads_ready);
+
+
+ for (int i = 0; i < n_threads; i++) {
+ pthread_join(t[i], 0);
+ }
+ delete [] t;
+ return 0;
+}
OpenPOWER on IntegriCloud