diff options
Diffstat (limited to 'libsanitizer/esan/working_set.cpp')
-rw-r--r-- | libsanitizer/esan/working_set.cpp | 281 |
1 files changed, 281 insertions, 0 deletions
diff --git a/libsanitizer/esan/working_set.cpp b/libsanitizer/esan/working_set.cpp new file mode 100644 index 00000000000..3082ead074c --- /dev/null +++ b/libsanitizer/esan/working_set.cpp @@ -0,0 +1,281 @@ +//===-- working_set.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of EfficiencySanitizer, a family of performance tuners. +// +// This file contains working-set-specific code. +//===----------------------------------------------------------------------===// + +#include "working_set.h" +#include "esan.h" +#include "esan_circular_buffer.h" +#include "esan_flags.h" +#include "esan_shadow.h" +#include "esan_sideline.h" +#include "sanitizer_common/sanitizer_procmaps.h" + +// We shadow every cache line of app memory with one shadow byte. +// - The highest bit of each shadow byte indicates whether the corresponding +// cache line has ever been accessed. +// - The lowest bit of each shadow byte indicates whether the corresponding +// cache line was accessed since the last sample. +// - The other bits are used for working set snapshots at successively +// lower frequencies, each bit to the left from the lowest bit stepping +// down the frequency by 2 to the power of getFlags()->snapshot_step. +// Thus we have something like this: +// Bit 0: Since last sample +// Bit 1: Since last 2^2 samples +// Bit 2: Since last 2^4 samples +// Bit 3: ... +// Bit 7: Ever accessed. +// We live with races in accessing each shadow byte. +typedef unsigned char byte; + +namespace __esan { + +// Our shadow memory assumes that the line size is 64. +static const u32 CacheLineSize = 64; + +// See the shadow byte layout description above. +static const u32 TotalWorkingSetBitIdx = 7; +// We accumulate to the left until we hit this bit. +// We don't need to accumulate to the final bit as it's set on each ref +// by the compiler instrumentation. +static const u32 MaxAccumBitIdx = 6; +static const u32 CurWorkingSetBitIdx = 0; +static const byte ShadowAccessedVal = + (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx); + +static SidelineThread Thread; +// If we use real-time-based timer samples this won't overflow in any realistic +// scenario, but if we switch to some other unit (such as memory accesses) we +// may want to consider a 64-bit int. +static u32 SnapshotNum; + +// We store the wset size for each of 8 different sampling frequencies. +static const u32 NumFreq = 8; // One for each bit of our shadow bytes. +// We cannot use static objects as the global destructor is called +// prior to our finalize routine. +// These are each circular buffers, sized up front. +CircularBuffer<u32> SizePerFreq[NumFreq]; +// We cannot rely on static initializers (they may run too late) but +// we record the size here for clarity: +u32 CircularBufferSizes[NumFreq] = { + // These are each mmap-ed so our minimum is one page. + 32*1024, + 16*1024, + 8*1024, + 4*1024, + 4*1024, + 4*1024, + 4*1024, + 4*1024, +}; + +void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size, + bool IsWrite) { + if (Size == 0) + return; + SIZE_T I = 0; + uptr LineSize = getFlags()->cache_line_size; + // As Addr+Size could overflow at the top of a 32-bit address space, + // we avoid the simpler formula that rounds the start and end. + SIZE_T NumLines = Size / LineSize + + // Add any extra at the start or end adding on an extra line: + (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize; + byte *Shadow = (byte *)appToShadow(Addr); + // Write shadow bytes until we're word-aligned. + while (I < NumLines && (uptr)Shadow % 4 != 0) { + if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) + *Shadow |= ShadowAccessedVal; + ++Shadow; + ++I; + } + // Write whole shadow words at a time. + // Using a word-stride loop improves the runtime of a microbenchmark of + // memset calls by 10%. + u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 | + ShadowAccessedVal << 16 | ShadowAccessedVal << 24; + while (I + 4 <= NumLines) { + if ((*(u32*)Shadow & WordValue) != WordValue) + *(u32*)Shadow |= WordValue; + Shadow += 4; + I += 4; + } + // Write any trailing shadow bytes. + while (I < NumLines) { + if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal) + *Shadow |= ShadowAccessedVal; + ++Shadow; + ++I; + } +} + +// This routine will word-align ShadowStart and ShadowEnd prior to scanning. +// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit +// measures the access during the entire execution and should never be cleared. +static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart, + uptr ShadowEnd) { + u32 WorkingSetSize = 0; + u32 ByteValue = 0x1 << BitIdx; + u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 | + ByteValue << 24; + // Get word aligned start. + ShadowStart = RoundDownTo(ShadowStart, sizeof(u32)); + bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx; + // Do not clear the bit that measures access during the entire execution. + bool Clear = BitIdx < TotalWorkingSetBitIdx; + for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) { + if ((*Ptr & WordValue) != 0) { + byte *BytePtr = (byte *)Ptr; + for (u32 j = 0; j < sizeof(u32); ++j) { + if (BytePtr[j] & ByteValue) { + ++WorkingSetSize; + if (Accum) { + // Accumulate to the lower-frequency bit to the left. + BytePtr[j] |= (ByteValue << 1); + } + } + } + if (Clear) { + // Clear this bit from every shadow byte. + *Ptr &= ~WordValue; + } + } + } + return WorkingSetSize; +} + +// Scan shadow memory to calculate the number of cache lines being accessed, +// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte. +// We also clear the lowest bits (most recent working set snapshot). +// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit +// measures the access during the entire execution and should never be cleared. +static u32 computeWorkingSizeAndReset(u32 BitIdx) { + u32 WorkingSetSize = 0; + MemoryMappingLayout MemIter(true/*cache*/); + uptr Start, End, Prot; + while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/, + 0/*file size*/, &Prot)) { + VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", + __FUNCTION__, Start, End, Prot, isAppMem(Start), + isShadowMem(Start)); + if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) { + VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End); + WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End); + } + } + return WorkingSetSize; +} + +// This is invoked from a signal handler but in a sideline thread doing nothing +// else so it is a little less fragile than a typical signal handler. +static void takeSample(void *Arg) { + u32 BitIdx = CurWorkingSetBitIdx; + u32 Freq = 1; + ++SnapshotNum; // Simpler to skip 0 whose mod matches everything. + while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) { + u32 NumLines = computeWorkingSizeAndReset(BitIdx); + VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName, + SnapshotNum, BitIdx, Freq, NumLines); + SizePerFreq[BitIdx].push_back(NumLines); + Freq = Freq << getFlags()->snapshot_step; + BitIdx++; + } +} + +unsigned int getSampleCountWorkingSet() +{ + return SnapshotNum; +} + +// Initialization that must be done before any instrumented code is executed. +void initializeShadowWorkingSet() { + CHECK(getFlags()->cache_line_size == CacheLineSize); +#if SANITIZER_LINUX && (defined(__x86_64__) || SANITIZER_MIPS64) + registerMemoryFaultHandler(); +#endif +} + +void initializeWorkingSet() { + if (getFlags()->record_snapshots) { + for (u32 i = 0; i < NumFreq; ++i) + SizePerFreq[i].initialize(CircularBufferSizes[i]); + Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq); + } +} + +static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) { + if (MilliSec > 600000) { + Unit = "min"; + return MilliSec / 60000; + } else if (MilliSec > 10000) { + Unit = "sec"; + return MilliSec / 1000; + } else { + Unit = "ms"; + return MilliSec; + } +} + +static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) { + // We need a constant to avoid software divide support: + static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize; + static const u32 MegabyteCachelines = KilobyteCachelines << 10; + + if (NumOfCachelines > 10 * MegabyteCachelines) { + Unit = "MB"; + return NumOfCachelines / MegabyteCachelines; + } else if (NumOfCachelines > 10 * KilobyteCachelines) { + Unit = "KB"; + return NumOfCachelines / KilobyteCachelines; + } else { + Unit = "Bytes"; + return NumOfCachelines * CacheLineSize; + } +} + +void reportWorkingSet() { + const char *Unit; + if (getFlags()->record_snapshots) { + u32 Freq = 1; + Report(" Total number of samples: %u\n", SnapshotNum); + for (u32 i = 0; i < NumFreq; ++i) { + u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit); + Report(" Samples array #%d at period %u %s\n", i, Time, Unit); + // FIXME: report whether we wrapped around and thus whether we + // have data on the whole run or just the last N samples. + for (u32 j = 0; j < SizePerFreq[i].size(); ++j) { + u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit); + Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit, + SizePerFreq[i][j]); + } + Freq = Freq << getFlags()->snapshot_step; + } + } + + // Get the working set size for the entire execution. + u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx); + u32 Size = getSizeForPrinting(NumOfCachelines, Unit); + Report(" %s: the total working set size: %u %s (%u cache lines)\n", + SanitizerToolName, Size, Unit, NumOfCachelines); +} + +int finalizeWorkingSet() { + if (getFlags()->record_snapshots) + Thread.joinThread(); + reportWorkingSet(); + if (getFlags()->record_snapshots) { + for (u32 i = 0; i < NumFreq; ++i) + SizePerFreq[i].free(); + } + return 0; +} + +} // namespace __esan |