summaryrefslogtreecommitdiffstats
path: root/contrib/compiler-rt/lib/esan/working_set.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/compiler-rt/lib/esan/working_set.cpp')
-rw-r--r--contrib/compiler-rt/lib/esan/working_set.cpp279
1 files changed, 279 insertions, 0 deletions
diff --git a/contrib/compiler-rt/lib/esan/working_set.cpp b/contrib/compiler-rt/lib/esan/working_set.cpp
new file mode 100644
index 0000000..f391119
--- /dev/null
+++ b/contrib/compiler-rt/lib/esan/working_set.cpp
@@ -0,0 +1,279 @@
+//===-- working_set.cpp ---------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// This file contains working-set-specific code.
+//===----------------------------------------------------------------------===//
+
+#include "working_set.h"
+#include "esan.h"
+#include "esan_circular_buffer.h"
+#include "esan_flags.h"
+#include "esan_shadow.h"
+#include "esan_sideline.h"
+#include "sanitizer_common/sanitizer_procmaps.h"
+
+// We shadow every cache line of app memory with one shadow byte.
+// - The highest bit of each shadow byte indicates whether the corresponding
+// cache line has ever been accessed.
+// - The lowest bit of each shadow byte indicates whether the corresponding
+// cache line was accessed since the last sample.
+// - The other bits are used for working set snapshots at successively
+// lower frequencies, each bit to the left from the lowest bit stepping
+// down the frequency by 2 to the power of getFlags()->snapshot_step.
+// Thus we have something like this:
+// Bit 0: Since last sample
+// Bit 1: Since last 2^2 samples
+// Bit 2: Since last 2^4 samples
+// Bit 3: ...
+// Bit 7: Ever accessed.
+// We live with races in accessing each shadow byte.
+typedef unsigned char byte;
+
+namespace __esan {
+
+// Our shadow memory assumes that the line size is 64.
+static const u32 CacheLineSize = 64;
+
+// See the shadow byte layout description above.
+static const u32 TotalWorkingSetBitIdx = 7;
+// We accumulate to the left until we hit this bit.
+// We don't need to accumulate to the final bit as it's set on each ref
+// by the compiler instrumentation.
+static const u32 MaxAccumBitIdx = 6;
+static const u32 CurWorkingSetBitIdx = 0;
+static const byte ShadowAccessedVal =
+ (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);
+
+static SidelineThread Thread;
+// If we use real-time-based timer samples this won't overflow in any realistic
+// scenario, but if we switch to some other unit (such as memory accesses) we
+// may want to consider a 64-bit int.
+static u32 SnapshotNum;
+
+// We store the wset size for each of 8 different sampling frequencies.
+static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
+// We cannot use static objects as the global destructor is called
+// prior to our finalize routine.
+// These are each circular buffers, sized up front.
+CircularBuffer<u32> SizePerFreq[NumFreq];
+// We cannot rely on static initializers (they may run too late) but
+// we record the size here for clarity:
+u32 CircularBufferSizes[NumFreq] = {
+ // These are each mmap-ed so our minimum is one page.
+ 32*1024,
+ 16*1024,
+ 8*1024,
+ 4*1024,
+ 4*1024,
+ 4*1024,
+ 4*1024,
+ 4*1024,
+};
+
+void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
+ bool IsWrite) {
+ if (Size == 0)
+ return;
+ SIZE_T I = 0;
+ uptr LineSize = getFlags()->cache_line_size;
+ // As Addr+Size could overflow at the top of a 32-bit address space,
+ // we avoid the simpler formula that rounds the start and end.
+ SIZE_T NumLines = Size / LineSize +
+ // Add any extra at the start or end adding on an extra line:
+ (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
+ byte *Shadow = (byte *)appToShadow(Addr);
+ // Write shadow bytes until we're word-aligned.
+ while (I < NumLines && (uptr)Shadow % 4 != 0) {
+ if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
+ *Shadow |= ShadowAccessedVal;
+ ++Shadow;
+ ++I;
+ }
+ // Write whole shadow words at a time.
+ // Using a word-stride loop improves the runtime of a microbenchmark of
+ // memset calls by 10%.
+ u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
+ ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
+ while (I + 4 <= NumLines) {
+ if ((*(u32*)Shadow & WordValue) != WordValue)
+ *(u32*)Shadow |= WordValue;
+ Shadow += 4;
+ I += 4;
+ }
+ // Write any trailing shadow bytes.
+ while (I < NumLines) {
+ if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
+ *Shadow |= ShadowAccessedVal;
+ ++Shadow;
+ ++I;
+ }
+}
+
+// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
+// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
+// measures the access during the entire execution and should never be cleared.
+static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
+ uptr ShadowEnd) {
+ u32 WorkingSetSize = 0;
+ u32 ByteValue = 0x1 << BitIdx;
+ u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
+ ByteValue << 24;
+ // Get word aligned start.
+ ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
+ bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
+ // Do not clear the bit that measures access during the entire execution.
+ bool Clear = BitIdx < TotalWorkingSetBitIdx;
+ for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
+ if ((*Ptr & WordValue) != 0) {
+ byte *BytePtr = (byte *)Ptr;
+ for (u32 j = 0; j < sizeof(u32); ++j) {
+ if (BytePtr[j] & ByteValue) {
+ ++WorkingSetSize;
+ if (Accum) {
+ // Accumulate to the lower-frequency bit to the left.
+ BytePtr[j] |= (ByteValue << 1);
+ }
+ }
+ }
+ if (Clear) {
+ // Clear this bit from every shadow byte.
+ *Ptr &= ~WordValue;
+ }
+ }
+ }
+ return WorkingSetSize;
+}
+
+// Scan shadow memory to calculate the number of cache lines being accessed,
+// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
+// We also clear the lowest bits (most recent working set snapshot).
+// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
+// measures the access during the entire execution and should never be cleared.
+static u32 computeWorkingSizeAndReset(u32 BitIdx) {
+ u32 WorkingSetSize = 0;
+ MemoryMappingLayout MemIter(true/*cache*/);
+ uptr Start, End, Prot;
+ while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/,
+ 0/*file size*/, &Prot)) {
+ VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n",
+ __FUNCTION__, Start, End, Prot, isAppMem(Start),
+ isShadowMem(Start));
+ if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) {
+ VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End);
+ WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End);
+ }
+ }
+ return WorkingSetSize;
+}
+
+// This is invoked from a signal handler but in a sideline thread doing nothing
+// else so it is a little less fragile than a typical signal handler.
+static void takeSample(void *Arg) {
+ u32 BitIdx = CurWorkingSetBitIdx;
+ u32 Freq = 1;
+ ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
+ while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
+ u32 NumLines = computeWorkingSizeAndReset(BitIdx);
+ VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
+ SnapshotNum, BitIdx, Freq, NumLines);
+ SizePerFreq[BitIdx].push_back(NumLines);
+ Freq = Freq << getFlags()->snapshot_step;
+ BitIdx++;
+ }
+}
+
+unsigned int getSampleCountWorkingSet()
+{
+ return SnapshotNum;
+}
+
+// Initialization that must be done before any instrumented code is executed.
+void initializeShadowWorkingSet() {
+ CHECK(getFlags()->cache_line_size == CacheLineSize);
+ registerMemoryFaultHandler();
+}
+
+void initializeWorkingSet() {
+ if (getFlags()->record_snapshots) {
+ for (u32 i = 0; i < NumFreq; ++i)
+ SizePerFreq[i].initialize(CircularBufferSizes[i]);
+ Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
+ }
+}
+
+static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
+ if (MilliSec > 600000) {
+ Unit = "min";
+ return MilliSec / 60000;
+ } else if (MilliSec > 10000) {
+ Unit = "sec";
+ return MilliSec / 1000;
+ } else {
+ Unit = "ms";
+ return MilliSec;
+ }
+}
+
+static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
+ // We need a constant to avoid software divide support:
+ static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
+ static const u32 MegabyteCachelines = KilobyteCachelines << 10;
+
+ if (NumOfCachelines > 10 * MegabyteCachelines) {
+ Unit = "MB";
+ return NumOfCachelines / MegabyteCachelines;
+ } else if (NumOfCachelines > 10 * KilobyteCachelines) {
+ Unit = "KB";
+ return NumOfCachelines / KilobyteCachelines;
+ } else {
+ Unit = "Bytes";
+ return NumOfCachelines * CacheLineSize;
+ }
+}
+
+void reportWorkingSet() {
+ const char *Unit;
+ if (getFlags()->record_snapshots) {
+ u32 Freq = 1;
+ Report(" Total number of samples: %u\n", SnapshotNum);
+ for (u32 i = 0; i < NumFreq; ++i) {
+ u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
+ Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
+ // FIXME: report whether we wrapped around and thus whether we
+ // have data on the whole run or just the last N samples.
+ for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
+ u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
+ Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
+ SizePerFreq[i][j]);
+ }
+ Freq = Freq << getFlags()->snapshot_step;
+ }
+ }
+
+ // Get the working set size for the entire execution.
+ u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
+ u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
+ Report(" %s: the total working set size: %u %s (%u cache lines)\n",
+ SanitizerToolName, Size, Unit, NumOfCachelines);
+}
+
+int finalizeWorkingSet() {
+ if (getFlags()->record_snapshots)
+ Thread.joinThread();
+ reportWorkingSet();
+ if (getFlags()->record_snapshots) {
+ for (u32 i = 0; i < NumFreq; ++i)
+ SizePerFreq[i].free();
+ }
+ return 0;
+}
+
+} // namespace __esan
OpenPOWER on IntegriCloud