diff options
Diffstat (limited to 'contrib/compiler-rt/lib/xray')
35 files changed, 3151 insertions, 358 deletions
diff --git a/contrib/compiler-rt/lib/xray/xray_AArch64.cc b/contrib/compiler-rt/lib/xray/xray_AArch64.cc index 0c1df22..f26e77d 100644 --- a/contrib/compiler-rt/lib/xray/xray_AArch64.cc +++ b/contrib/compiler-rt/lib/xray/xray_AArch64.cc @@ -14,29 +14,14 @@ //===----------------------------------------------------------------------===// #include "sanitizer_common/sanitizer_common.h" #include "xray_defs.h" -#include "xray_emulate_tsc.h" #include "xray_interface_internal.h" #include <atomic> #include <cassert> - -extern "C" void __clear_cache(void* start, void* end); +extern "C" void __clear_cache(void *start, void *end); namespace __xray { -uint64_t cycleFrequency() XRAY_NEVER_INSTRUMENT { - // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does - // not have a constant frequency like TSC on x86[_64]; it may go faster or - // slower depending on CPU's turbo or power saving modes. Furthermore, to - // read from CP15 on ARM a kernel modification or a driver is needed. - // We can not require this from users of compiler-rt. - // So on ARM we use clock_gettime(2) which gives the result in nanoseconds. - // To get the measurements per second, we scale this by the number of - // nanoseconds per second, pretending that the TSC frequency is 1GHz and - // one TSC tick is 1 nanosecond. - return NanosecondsPerSecond; -} - // The machine codes for some instructions used in runtime patching. enum class PatchOpcodes : uint32_t { PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]! 
@@ -100,14 +85,15 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), uint32_t(PatchOpcodes::PO_B32), std::memory_order_release); } - __clear_cache(reinterpret_cast<char*>(FirstAddress), - reinterpret_cast<char*>(CurAddress)); + __clear_cache(reinterpret_cast<char *>(FirstAddress), + reinterpret_cast<char *>(CurAddress)); return true; } bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { - return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry); + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, Trampoline); } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, @@ -117,9 +103,20 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { - // FIXME: In the future we'd need to distinguish between non-tail exits and - // tail exits for better information preservation. - return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); + return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) + XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64? + return false; +} + +// FIXME: Maybe implement this better? 
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + } // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/compiler-rt/lib/xray/xray_always_instrument.txt b/contrib/compiler-rt/lib/xray/xray_always_instrument.txt new file mode 100644 index 0000000..151ed70 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_always_instrument.txt @@ -0,0 +1,6 @@ +# List of function matchers common to C/C++ applications that make sense to +# always instrument. You can use this as an argument to +# -fxray-always-instrument=<path> along with your project-specific lists. + +# Always instrument the main function. +fun:main diff --git a/contrib/compiler-rt/lib/xray/xray_arm.cc b/contrib/compiler-rt/lib/xray/xray_arm.cc index f5e2cd2..da4efcd 100644 --- a/contrib/compiler-rt/lib/xray/xray_arm.cc +++ b/contrib/compiler-rt/lib/xray/xray_arm.cc @@ -14,28 +14,14 @@ //===----------------------------------------------------------------------===// #include "sanitizer_common/sanitizer_common.h" #include "xray_defs.h" -#include "xray_emulate_tsc.h" #include "xray_interface_internal.h" #include <atomic> #include <cassert> -extern "C" void __clear_cache(void* start, void* end); +extern "C" void __clear_cache(void *start, void *end); namespace __xray { -uint64_t cycleFrequency() XRAY_NEVER_INSTRUMENT { - // There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does - // not have a constant frequency like TSC on x86[_64]; it may go faster or - // slower depending on CPU's turbo or power saving modes. Furthermore, to - // read from CP15 on ARM a kernel modification or a driver is needed. - // We can not require this from users of compiler-rt. - // So on ARM we use clock_gettime(2) which gives the result in nanoseconds. 
- // To get the measurements per second, we scale this by the number of - // nanoseconds per second, pretending that the TSC frequency is 1GHz and - // one TSC tick is 1 nanosecond. - return NanosecondsPerSecond; -} - // The machine codes for some instructions used in runtime patching. enum class PatchOpcodes : uint32_t { PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr} @@ -74,7 +60,7 @@ write32bitLoadReg(uint8_t regNo, uint32_t *Address, // MOVW r0, #<lower 16 bits of the |Value|> // MOVT r0, #<higher 16 bits of the |Value|> inline static uint32_t * -Write32bitLoadR0(uint32_t *Address, +write32bitLoadR0(uint32_t *Address, const uint32_t Value) XRAY_NEVER_INSTRUMENT { return write32bitLoadReg(0, Address, Value); } @@ -83,7 +69,7 @@ Write32bitLoadR0(uint32_t *Address, // MOVW ip, #<lower 16 bits of the |Value|> // MOVT ip, #<higher 16 bits of the |Value|> inline static uint32_t * -Write32bitLoadIP(uint32_t *Address, +write32bitLoadIP(uint32_t *Address, const uint32_t Value) XRAY_NEVER_INSTRUMENT { return write32bitLoadReg(12, Address, Value); } @@ -121,9 +107,9 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, uint32_t *CurAddress = FirstAddress + 1; if (Enable) { CurAddress = - Write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId)); + write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId)); CurAddress = - Write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook)); + write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook)); *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp); CurAddress++; *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr); @@ -136,14 +122,15 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId, reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), uint32_t(PatchOpcodes::PO_B20), std::memory_order_release); } - __clear_cache(reinterpret_cast<char*>(FirstAddress), - reinterpret_cast<char*>(CurAddress)); + __clear_cache(reinterpret_cast<char *>(FirstAddress), + 
reinterpret_cast<char *>(CurAddress)); return true; } bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { - return patchSled(Enable, FuncId, Sled, __xray_FunctionEntry); + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, Trampoline); } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, @@ -153,9 +140,20 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { - // FIXME: In the future we'd need to distinguish between non-tail exits and - // tail exits for better information preservation. - return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); + return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) + XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm? + return false; } +// FIXME: Maybe implement this better? 
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + } // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc index 7e5462f..7ba755a 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc @@ -13,53 +13,69 @@ // //===----------------------------------------------------------------------===// #include "xray_buffer_queue.h" -#include <cassert> +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_libc.h" + #include <cstdlib> +#include <tuple> using namespace __xray; +using namespace __sanitizer; -BufferQueue::BufferQueue(std::size_t B, std::size_t N) - : BufferSize(B), Buffers(N), Mutex(), OwnedBuffers(), Finalizing(false) { - for (auto &Buf : Buffers) { +BufferQueue::BufferQueue(std::size_t B, std::size_t N, bool &Success) + : BufferSize(B), Buffers(N), Mutex(), OwnedBuffers(), Finalizing{0} { + for (auto &T : Buffers) { void *Tmp = malloc(BufferSize); + if (Tmp == nullptr) { + Success = false; + return; + } + + auto &Buf = std::get<0>(T); Buf.Buffer = Tmp; Buf.Size = B; - if (Tmp != 0) - OwnedBuffers.insert(Tmp); + OwnedBuffers.emplace(Tmp); } + Success = true; } -std::error_code BufferQueue::getBuffer(Buffer &Buf) { - if (Finalizing.load(std::memory_order_acquire)) - return std::make_error_code(std::errc::state_not_recoverable); - std::lock_guard<std::mutex> Guard(Mutex); +BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { + if (__sanitizer::atomic_load(&Finalizing, __sanitizer::memory_order_acquire)) + return ErrorCode::QueueFinalizing; + __sanitizer::BlockingMutexLock Guard(&Mutex); if (Buffers.empty()) - return std::make_error_code(std::errc::not_enough_memory); - Buf = Buffers.front(); + return 
ErrorCode::NotEnoughMemory; + auto &T = Buffers.front(); + auto &B = std::get<0>(T); + Buf = B; + B.Buffer = nullptr; + B.Size = 0; Buffers.pop_front(); - return {}; + return ErrorCode::Ok; } -std::error_code BufferQueue::releaseBuffer(Buffer &Buf) { +BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { if (OwnedBuffers.count(Buf.Buffer) == 0) - return std::make_error_code(std::errc::argument_out_of_domain); - std::lock_guard<std::mutex> Guard(Mutex); - Buffers.push_back(Buf); + return ErrorCode::UnrecognizedBuffer; + __sanitizer::BlockingMutexLock Guard(&Mutex); + + // Now that the buffer has been released, we mark it as "used". + Buffers.emplace(Buffers.end(), Buf, true /* used */); Buf.Buffer = nullptr; - Buf.Size = BufferSize; - return {}; + Buf.Size = 0; + return ErrorCode::Ok; } -std::error_code BufferQueue::finalize() { - if (Finalizing.exchange(true, std::memory_order_acq_rel)) - return std::make_error_code(std::errc::state_not_recoverable); - return {}; +BufferQueue::ErrorCode BufferQueue::finalize() { + if (__sanitizer::atomic_exchange(&Finalizing, 1, + __sanitizer::memory_order_acq_rel)) + return ErrorCode::QueueFinalizing; + return ErrorCode::Ok; } BufferQueue::~BufferQueue() { - for (auto &Buf : Buffers) { + for (auto &T : Buffers) { + auto &Buf = std::get<0>(T); free(Buf.Buffer); - Buf.Buffer = nullptr; - Buf.Size = 0; } } diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h index bf0b7af..e051695 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h @@ -15,12 +15,11 @@ #ifndef XRAY_BUFFER_QUEUE_H #define XRAY_BUFFER_QUEUE_H -#include <atomic> -#include <cstdint> +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_mutex.h" #include <deque> -#include <mutex> -#include <system_error> #include <unordered_set> +#include <utility> namespace __xray { @@ -33,19 +32,47 @@ class BufferQueue { 
public: struct Buffer { void *Buffer = nullptr; - std::size_t Size = 0; + size_t Size = 0; }; private: - std::size_t BufferSize; - std::deque<Buffer> Buffers; - std::mutex Mutex; + size_t BufferSize; + + // We use a bool to indicate whether the Buffer has been used in this + // freelist implementation. + std::deque<std::tuple<Buffer, bool>> Buffers; + __sanitizer::BlockingMutex Mutex; std::unordered_set<void *> OwnedBuffers; - std::atomic<bool> Finalizing; + __sanitizer::atomic_uint8_t Finalizing; public: - /// Initialise a queue of size |N| with buffers of size |B|. - BufferQueue(std::size_t B, std::size_t N); + enum class ErrorCode : unsigned { + Ok, + NotEnoughMemory, + QueueFinalizing, + UnrecognizedBuffer, + AlreadyFinalized, + }; + + static const char *getErrorString(ErrorCode E) { + switch (E) { + case ErrorCode::Ok: + return "(none)"; + case ErrorCode::NotEnoughMemory: + return "no available buffers in the queue"; + case ErrorCode::QueueFinalizing: + return "queue already finalizing"; + case ErrorCode::UnrecognizedBuffer: + return "buffer being returned not owned by buffer queue"; + case ErrorCode::AlreadyFinalized: + return "queue already finalized"; + } + return "unknown error"; + } + + /// Initialise a queue of size |N| with buffers of size |B|. We report success + /// through |Success|. + BufferQueue(size_t B, size_t N, bool &Success); /// Updates |Buf| to contain the pointer to an appropriate buffer. Returns an /// error in case there are no available buffers to return when we will run @@ -58,24 +85,41 @@ public: /// - std::errc::not_enough_memory on exceeding MaxSize. /// - no error when we find a Buffer. /// - std::errc::state_not_recoverable on finalising BufferQueue. - std::error_code getBuffer(Buffer &Buf); + ErrorCode getBuffer(Buffer &Buf); /// Updates |Buf| to point to nullptr, with size 0. /// /// Returns: /// - ... 
- std::error_code releaseBuffer(Buffer &Buf); - - bool finalizing() const { return Finalizing.load(std::memory_order_acquire); } - - // Sets the state of the BufferQueue to finalizing, which ensures that: - // - // - All subsequent attempts to retrieve a Buffer will fail. - // - All releaseBuffer operations will not fail. - // - // After a call to finalize succeeds, all subsequent calls to finalize will - // fail with std::errc::state_not_recoverable. - std::error_code finalize(); + ErrorCode releaseBuffer(Buffer &Buf); + + bool finalizing() const { + return __sanitizer::atomic_load(&Finalizing, + __sanitizer::memory_order_acquire); + } + + /// Returns the configured size of the buffers in the buffer queue. + size_t ConfiguredBufferSize() const { return BufferSize; } + + /// Sets the state of the BufferQueue to finalizing, which ensures that: + /// + /// - All subsequent attempts to retrieve a Buffer will fail. + /// - All releaseBuffer operations will not fail. + /// + /// After a call to finalize succeeds, all subsequent calls to finalize will + /// fail with std::errc::state_not_recoverable. + ErrorCode finalize(); + + /// Applies the provided function F to each Buffer in the queue, only if the + /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a + /// releaseBuffer(...) operation. + template <class F> void apply(F Fn) { + __sanitizer::BlockingMutexLock G(&Mutex); + for (const auto &T : Buffers) { + if (std::get<1>(T)) + Fn(std::get<0>(T)); + } + } // Cleans up allocated buffers. 
~BufferQueue(); diff --git a/contrib/compiler-rt/lib/xray/xray_emulate_tsc.h b/contrib/compiler-rt/lib/xray/xray_emulate_tsc.h deleted file mode 100644 index a3e8b1c..0000000 --- a/contrib/compiler-rt/lib/xray/xray_emulate_tsc.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- xray_emulate_tsc.h --------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file is a part of XRay, a dynamic runtime instrumentation system. -// -//===----------------------------------------------------------------------===// -#ifndef XRAY_EMULATE_TSC_H -#define XRAY_EMULATE_TSC_H - -#include "sanitizer_common/sanitizer_common.h" -#include "sanitizer_common/sanitizer_internal_defs.h" -#include "xray_defs.h" -#include <cerrno> -#include <cstdint> -#include <time.h> - -namespace __xray { - -static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000; - -ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { - timespec TS; - int result = clock_gettime(CLOCK_REALTIME, &TS); - if (result != 0) { - Report("clock_gettime(2) returned %d, errno=%d.", result, int(errno)); - TS.tv_sec = 0; - TS.tv_nsec = 0; - } - CPU = 0; - return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; -} -} - -#endif // XRAY_EMULATE_TSC_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h new file mode 100644 index 0000000..3d6d388 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h @@ -0,0 +1,66 @@ +//===-- xray_fdr_log_records.h -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_FDR_LOG_RECORDS_H +#define XRAY_XRAY_FDR_LOG_RECORDS_H + +enum class RecordType : uint8_t { Function, Metadata }; + +// A MetadataRecord encodes the kind of record in its first byte, and have 15 +// additional bytes in the end to hold free-form data. +struct alignas(16) MetadataRecord { + // A MetadataRecord must always have a type of 1. + /* RecordType */ uint8_t Type : 1; + + // Each kind of record is represented as a 7-bit value (even though we use an + // unsigned 8-bit enum class to do so). + enum class RecordKinds : uint8_t { + NewBuffer, + EndOfBuffer, + NewCPUId, + TSCWrap, + WalltimeMarker, + CustomEventMarker, + }; + // Use 7 bits to identify this record type. + /* RecordKinds */ uint8_t RecordKind : 7; + char Data[15]; +} __attribute__((packed)); + +static_assert(sizeof(MetadataRecord) == 16, "Wrong size for MetadataRecord."); + +struct alignas(8) FunctionRecord { + // A FunctionRecord must always have a type of 0. + /* RecordType */ uint8_t Type : 1; + enum class RecordKinds { + FunctionEnter = 0x00, + FunctionExit = 0x01, + FunctionTailExit = 0x02, + }; + /* RecordKinds */ uint8_t RecordKind : 3; + + // We only use 28 bits of the function ID, so that we can use as few bytes as + // possible. This means we only support 2^28 (268,435,456) unique function ids + // in a single binary. + int FuncId : 28; + + // We use another 4 bytes to hold the delta between the previous entry's TSC. + // In case we've found that the distance is greater than the allowable 32 bits + // (either because we are running in a different CPU and the TSC might be + // different then), we should use a MetadataRecord before this FunctionRecord + // that will contain the full TSC for that CPU, and keep this to 0. 
+ uint32_t TSCDelta; +} __attribute__((packed)); + +static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord."); + +#endif // XRAY_XRAY_FDR_LOG_RECORDS_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc new file mode 100644 index 0000000..a7e1382 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc @@ -0,0 +1,300 @@ +//===-- xray_fdr_logging.cc ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Here we implement the Flight Data Recorder mode for XRay, where we use +// compact structures to store records in memory as well as when writing out the +// data to files. +// +//===----------------------------------------------------------------------===// +#include "xray_fdr_logging.h" +#include <algorithm> +#include <bitset> +#include <cerrno> +#include <cstring> +#include <sys/syscall.h> +#include <sys/time.h> +#include <time.h> +#include <unistd.h> +#include <unordered_map> + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_common.h" +#include "xray/xray_interface.h" +#include "xray/xray_records.h" +#include "xray_buffer_queue.h" +#include "xray_defs.h" +#include "xray_fdr_logging_impl.h" +#include "xray_flags.h" +#include "xray_tsc.h" +#include "xray_utils.h" + +namespace __xray { + +// Global BufferQueue. +std::shared_ptr<BufferQueue> BQ; + +__sanitizer::atomic_sint32_t LogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +FDRLoggingOptions FDROptions; + +__sanitizer::SpinMutex FDROptionsMutex; + +// Must finalize before flushing. 
+XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&LoggingStatus, + __sanitizer::memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_FINALIZED) + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + + s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + if (!__sanitizer::atomic_compare_exchange_strong( + &LogFlushStatus, &Result, XRayLogFlushStatus::XRAY_LOG_FLUSHING, + __sanitizer::memory_order_release)) + return static_cast<XRayLogFlushStatus>(Result); + + // Make a copy of the BufferQueue pointer to prevent other threads that may be + // resetting it from blowing away the queue prematurely while we're dealing + // with it. + auto LocalBQ = BQ; + + // We write out the file in the following format: + // + // 1) We write down the XRay file header with version 1, type FDR_LOG. + // 2) Then we use the 'apply' member of the BufferQueue that's live, to + // ensure that at this point in time we write down the buffers that have + // been released (and marked "used") -- we dump the full buffer for now + // (fixed-sized) and let the tools reading the buffers deal with the data + // afterwards. + // + int Fd = -1; + { + __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); + Fd = FDROptions.Fd; + } + if (Fd == -1) + Fd = getLogFD(); + if (Fd == -1) { + auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + __sanitizer::atomic_store(&LogFlushStatus, Result, + __sanitizer::memory_order_release); + return Result; + } + + // Test for required CPU features and cache the cycle frequency + static bool TSCSupported = probeRequiredCPUFeatures(); + static uint64_t CycleFrequency = + TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; + + XRayFileHeader Header; + Header.Version = 1; + Header.Type = FileTypes::FDR_LOG; + Header.CycleFrequency = CycleFrequency; + // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' + // before setting the values in the header. 
+ Header.ConstantTSC = 1; + Header.NonstopTSC = 1; + Header.FdrData = FdrAdditionalHeaderData{LocalBQ->ConfiguredBufferSize()}; + retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), + reinterpret_cast<char *>(&Header) + sizeof(Header)); + + LocalBQ->apply([&](const BufferQueue::Buffer &B) { + uint64_t BufferSize = B.Size; + if (BufferSize > 0) { + retryingWriteAll(Fd, reinterpret_cast<char *>(B.Buffer), + reinterpret_cast<char *>(B.Buffer) + B.Size); + } + }); + __sanitizer::atomic_store(&LogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHED, + __sanitizer::memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED; + if (!__sanitizer::atomic_compare_exchange_strong( + &LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_FINALIZING, + __sanitizer::memory_order_release)) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + // Do special things to make the log finalize itself, and not allow any more + // operations to be performed until re-initialized. + BQ->finalize(); + + __sanitizer::atomic_store(&LoggingStatus, + XRayLogInitStatus::XRAY_LOG_FINALIZED, + __sanitizer::memory_order_release); + return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT { + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_FINALIZED; + if (__sanitizer::atomic_compare_exchange_strong( + &LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZED, + __sanitizer::memory_order_release)) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + // Release the in-memory buffer queue. + BQ.reset(); + + // Spin until the flushing status is flushed. 
+ s32 CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED; + while (__sanitizer::atomic_compare_exchange_weak( + &LogFlushStatus, &CurrentFlushingStatus, + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, + __sanitizer::memory_order_release)) { + if (CurrentFlushingStatus == XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING) + break; + CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED; + } + + // At this point, we know that the status is flushed, and that we can assume + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; +} + +static std::tuple<uint64_t, unsigned char> +getTimestamp() XRAY_NEVER_INSTRUMENT { + // We want to get the TSC as early as possible, so that we can check whether + // we've seen this CPU before. We also do it before we load anything else, to + // allow for forward progress with the scheduling. + unsigned char CPU; + uint64_t TSC; + + // Test once for required CPU features + static bool TSCSupported = probeRequiredCPUFeatures(); + + if (TSCSupported) { + TSC = __xray::readTSC(CPU); + } else { + // FIXME: This code needs refactoring as it appears in multiple locations + timespec TS; + int result = clock_gettime(CLOCK_REALTIME, &TS); + if (result != 0) { + Report("clock_gettime(2) return %d, errno=%d", result, int(errno)); + TS = {0, 0}; + } + CPU = 0; + TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + } + return std::make_tuple(TSC, CPU); +} + +void fdrLoggingHandleArg0(int32_t FuncId, + XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { + auto TSC_CPU = getTimestamp(); + __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU), + std::get<1>(TSC_CPU), clock_gettime, + LoggingStatus, BQ); +} + +void fdrLoggingHandleCustomEvent(void *Event, + std::size_t EventSize) XRAY_NEVER_INSTRUMENT { + using namespace __xray_fdr_internal; + auto TSC_CPU = getTimestamp(); + auto &TSC = std::get<0>(TSC_CPU); + auto &CPU = std::get<1>(TSC_CPU); + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) 
{ + assert(Running && "RecursionGuard is buggy!"); + return; + } + if (EventSize > std::numeric_limits<int32_t>::max()) { + using Empty = struct {}; + static Empty Once = [&] { + Report("Event size too large = %zu ; > max = %d\n", EventSize, + std::numeric_limits<int32_t>::max()); + return Empty(); + }(); + (void)Once; + } + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime)) + return; + + // Here we need to prepare the log to handle: + // - The metadata record we're going to write. (16 bytes) + // - The additional data we're going to write. Currently, that's the size of + // the event we're going to dump into the log as free-form bytes. + if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) { + LocalBQ = nullptr; + return; + } + + // Write the custom event metadata record, which consists of the following + // information: + // - 8 bytes (64-bits) for the full TSC when the event started. + // - 4 bytes (32-bits) for the length of the data. 
+ MetadataRecord CustomEvent; + CustomEvent.Type = uint8_t(RecordType::Metadata); + CustomEvent.RecordKind = + uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); + constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU)); + std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); + std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); + std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent)); + RecordPtr += sizeof(CustomEvent); + std::memcpy(RecordPtr, Event, ReducedEventSize); + endBufferIfFull(); +} + +XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, + void *Options, + size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + if (OptionsSize != sizeof(FDRLoggingOptions)) + return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load( + &LoggingStatus, __sanitizer::memory_order_acquire)); + s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + if (!__sanitizer::atomic_compare_exchange_strong( + &LoggingStatus, &CurrentStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZING, + __sanitizer::memory_order_release)) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + { + __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); + memcpy(&FDROptions, Options, OptionsSize); + } + + bool Success = false; + BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success); + if (!Success) { + Report("BufferQueue init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + // Install the actual handleArg0 handler after initialising the buffers. 
+ __xray_set_handler(fdrLoggingHandleArg0); + __xray_set_customevent_handler(fdrLoggingHandleCustomEvent); + + __sanitizer::atomic_store(&LoggingStatus, + XRayLogInitStatus::XRAY_LOG_INITIALIZED, + __sanitizer::memory_order_release); + Report("XRay FDR init successful.\n"); + return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +} // namespace __xray + +static auto UNUSED Unused = [] { + using namespace __xray; + if (flags()->xray_fdr_log) { + XRayLogImpl Impl{ + fdrLoggingInit, fdrLoggingFinalize, fdrLoggingHandleArg0, + fdrLoggingFlush, + }; + __xray_set_log_impl(Impl); + } + return true; +}(); diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging.h b/contrib/compiler-rt/lib/xray/xray_fdr_logging.h new file mode 100644 index 0000000..426b54d --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging.h @@ -0,0 +1,38 @@ +//===-- xray_fdr_logging.h ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_FDR_LOGGING_H +#define XRAY_XRAY_FDR_LOGGING_H + +#include "xray/xray_log_interface.h" +#include "xray_fdr_log_records.h" + +// FDR (Flight Data Recorder) Mode +// =============================== +// +// The XRay whitepaper describes a mode of operation for function call trace +// logging that involves writing small records into an in-memory circular +// buffer, that then gets logged to disk on demand. To do this efficiently and +// capture as much data as we can, we use smaller records compared to the +// default mode of always writing fixed-size records. 
+ +namespace __xray { +XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, + void *Options, size_t OptionsSize); +XRayLogInitStatus fdrLoggingFinalize(); +void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry); +XRayLogFlushStatus fdrLoggingFlush(); +XRayLogInitStatus fdrLoggingReset(); + +} // namespace __xray + +#endif // XRAY_XRAY_FDR_LOGGING_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging_impl.h b/contrib/compiler-rt/lib/xray/xray_fdr_logging_impl.h new file mode 100644 index 0000000..4a1d80f --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging_impl.h @@ -0,0 +1,694 @@ +//===-- xray_fdr_logging_impl.h ---------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Here we implement the thread local state management and record i/o for Flight +// Data Recorder mode for XRay, where we use compact structures to store records +// in memory as well as when writing out the data to files. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_FDR_LOGGING_IMPL_H +#define XRAY_XRAY_FDR_LOGGING_IMPL_H + +#include <cassert> +#include <cstdint> +#include <cstring> +#include <limits> +#include <memory> +#include <string> +#include <sys/syscall.h> +#include <time.h> +#include <unistd.h> + +#include "sanitizer_common/sanitizer_common.h" +#include "xray/xray_log_interface.h" +#include "xray_buffer_queue.h" +#include "xray_defs.h" +#include "xray_fdr_log_records.h" +#include "xray_flags.h" +#include "xray_tsc.h" + +namespace __xray { + +__sanitizer::atomic_sint32_t LoggingStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +/// We expose some of the state transitions when FDR logging mode is operating +/// such that we can simulate a series of log events that may occur without +/// and test with determinism without worrying about the real CPU time. +/// +/// Because the code uses thread_local allocation extensively as part of its +/// design, callers that wish to test events occuring on different threads +/// will actually have to run them on different threads. +/// +/// This also means that it is possible to break invariants maintained by +/// cooperation with xray_fdr_logging class, so be careful and think twice. +namespace __xray_fdr_internal { + +/// Writes the new buffer record and wallclock time that begin a buffer for a +/// thread to MemPtr and increments MemPtr. Bypasses the thread local state +/// machine and writes directly to memory without checks. +static void writeNewBufferPreamble(pid_t Tid, timespec TS, char *&MemPtr); + +/// Write a metadata record to switch to a new CPU to MemPtr and increments +/// MemPtr. Bypasses the thread local state machine and writes directly to +/// memory without checks. +static void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, char *&MemPtr); + +/// Writes an EOB metadata record to MemPtr and increments MemPtr. 
Bypasses the +/// thread local state machine and writes directly to memory without checks. +static void writeEOBMetadata(char *&MemPtr); + +/// Writes a TSC Wrap metadata record to MemPtr and increments MemPtr. Bypasses +/// the thread local state machine and directly writes to memory without checks. +static void writeTSCWrapMetadata(uint64_t TSC, char *&MemPtr); + +/// Writes a Function Record to MemPtr and increments MemPtr. Bypasses the +/// thread local state machine and writes the function record directly to +/// memory. +static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, + XRayEntryType EntryType, char *&MemPtr); + +/// Sets up a new buffer in thread_local storage and writes a preamble. The +/// wall_clock_reader function is used to populate the WallTimeRecord entry. +static void setupNewBuffer(int (*wall_clock_reader)(clockid_t, + struct timespec *)); + +/// Called to record CPU time for a new CPU within the current thread. +static void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC); + +/// Called to close the buffer when the thread exhausts the buffer or when the +/// thread exits (via a thread local variable destructor). +static void writeEOBMetadata(); + +/// TSC Wrap records are written when a TSC delta encoding scheme overflows. +static void writeTSCWrapMetadata(uint64_t TSC); + +/// Here's where the meat of the processing happens. The writer captures +/// function entry, exit and tail exit points with a time and will create +/// TSCWrap, NewCPUId and Function records as necessary. The writer might +/// walk backward through its buffer and erase trivial functions to avoid +/// polluting the log and may use the buffer queue to obtain or release a +/// buffer. 
+static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, + uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, + struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr<BufferQueue> &BQ); + +//-----------------------------------------------------------------------------| +// The rest of the file is implementation. | +//-----------------------------------------------------------------------------| +// Functions are implemented in the header for inlining since we don't want | +// to grow the stack when we've hijacked the binary for logging. | +//-----------------------------------------------------------------------------| + +namespace { + +thread_local BufferQueue::Buffer Buffer; +thread_local char *RecordPtr = nullptr; + +// The number of FunctionEntry records immediately preceding RecordPtr. +thread_local uint8_t NumConsecutiveFnEnters = 0; + +// The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit +// records preceding RecordPtr. +thread_local uint8_t NumTailCalls = 0; + +constexpr auto MetadataRecSize = sizeof(MetadataRecord); +constexpr auto FunctionRecSize = sizeof(FunctionRecord); + +// We use a thread_local variable to keep track of which CPUs we've already +// run, and the TSC times for these CPUs. This allows us to stop repeating the +// CPU field in the function records. +// +// We assume that we'll support only 65536 CPUs for x86_64. 
+thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); +thread_local uint64_t LastTSC = 0; +thread_local uint64_t LastFunctionEntryTSC = 0; + +class ThreadExitBufferCleanup { + std::shared_ptr<BufferQueue> &Buffers; + BufferQueue::Buffer &Buffer; + +public: + explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ, + BufferQueue::Buffer &Buffer) + XRAY_NEVER_INSTRUMENT : Buffers(BQ), + Buffer(Buffer) {} + + ~ThreadExitBufferCleanup() noexcept XRAY_NEVER_INSTRUMENT { + if (RecordPtr == nullptr) + return; + + // We make sure that upon exit, a thread will write out the EOB + // MetadataRecord in the thread-local log, and also release the buffer to + // the queue. + assert((RecordPtr + MetadataRecSize) - static_cast<char *>(Buffer.Buffer) >= + static_cast<ptrdiff_t>(MetadataRecSize)); + if (Buffers) { + writeEOBMetadata(); + auto EC = Buffers->releaseBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) + Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, + BufferQueue::getErrorString(EC)); + Buffers = nullptr; + return; + } + } +}; + +// Make sure a thread that's ever called handleArg0 has a thread-local +// live reference to the buffer queue for this particular instance of +// FDRLogging, and that we're going to clean it up when the thread exits. 
+thread_local std::shared_ptr<BufferQueue> LocalBQ = nullptr; +thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); + +class RecursionGuard { + bool &Running; + const bool Valid; + +public: + explicit RecursionGuard(bool &R) : Running(R), Valid(!R) { + if (Valid) + Running = true; + } + + RecursionGuard(const RecursionGuard &) = delete; + RecursionGuard(RecursionGuard &&) = delete; + RecursionGuard &operator=(const RecursionGuard &) = delete; + RecursionGuard &operator=(RecursionGuard &&) = delete; + + explicit operator bool() const { return Valid; } + + ~RecursionGuard() noexcept { + if (Valid) + Running = false; + } +}; + +inline bool loggingInitialized( + const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT { + return __sanitizer::atomic_load(&LoggingStatus, + __sanitizer::memory_order_acquire) == + XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +} // namespace + +inline void writeNewBufferPreamble(pid_t Tid, timespec TS, + char *&MemPtr) XRAY_NEVER_INSTRUMENT { + static constexpr int InitRecordsCount = 2; + std::aligned_storage<sizeof(MetadataRecord)>::type Records[InitRecordsCount]; + { + // Write out a MetadataRecord to signify that this is the start of a new + // buffer, associated with a particular thread, with a new CPU. For the + // data, we have 15 bytes to squeeze as much information as we can. At this + // point we only write down the following bytes: + // - Thread ID (pid_t, 4 bytes) + auto &NewBuffer = *reinterpret_cast<MetadataRecord *>(&Records[0]); + NewBuffer.Type = uint8_t(RecordType::Metadata); + NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); + std::memcpy(&NewBuffer.Data, &Tid, sizeof(pid_t)); + } + // Also write the WalltimeMarker record. 
+ { + static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); + auto &WalltimeMarker = *reinterpret_cast<MetadataRecord *>(&Records[1]); + WalltimeMarker.Type = uint8_t(RecordType::Metadata); + WalltimeMarker.RecordKind = + uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); + + // We only really need microsecond precision here, and enforce across + // platforms that we need 64-bit seconds and 32-bit microseconds encoded in + // the Metadata record. + int32_t Micros = TS.tv_nsec / 1000; + int64_t Seconds = TS.tv_sec; + std::memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); + std::memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, sizeof(Micros)); + } + std::memcpy(MemPtr, Records, sizeof(MetadataRecord) * InitRecordsCount); + MemPtr += sizeof(MetadataRecord) * InitRecordsCount; + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; +} + +inline void setupNewBuffer(int (*wall_clock_reader)( + clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { + RecordPtr = static_cast<char *>(Buffer.Buffer); + pid_t Tid = syscall(SYS_gettid); + timespec TS{0, 0}; + // This is typically clock_gettime, but callers have injection ability. + wall_clock_reader(CLOCK_MONOTONIC, &TS); + writeNewBufferPreamble(Tid, TS, RecordPtr); + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; +} + +inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, + char *&MemPtr) XRAY_NEVER_INSTRUMENT { + MetadataRecord NewCPUId; + NewCPUId.Type = uint8_t(RecordType::Metadata); + NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); + + // The data for the New CPU will contain the following bytes: + // - CPU ID (uint16_t, 2 bytes) + // - Full TSC (uint64_t, 8 bytes) + // Total = 10 bytes. 
+ std::memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); + std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); + std::memcpy(MemPtr, &NewCPUId, sizeof(MetadataRecord)); + MemPtr += sizeof(MetadataRecord); + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; +} + +inline void writeNewCPUIdMetadata(uint16_t CPU, + uint64_t TSC) XRAY_NEVER_INSTRUMENT { + writeNewCPUIdMetadata(CPU, TSC, RecordPtr); +} + +inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { + MetadataRecord EOBMeta; + EOBMeta.Type = uint8_t(RecordType::Metadata); + EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer); + // For now we don't write any bytes into the Data field. + std::memcpy(MemPtr, &EOBMeta, sizeof(MetadataRecord)); + MemPtr += sizeof(MetadataRecord); + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; +} + +inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { + writeEOBMetadata(RecordPtr); +} + +inline void writeTSCWrapMetadata(uint64_t TSC, + char *&MemPtr) XRAY_NEVER_INSTRUMENT { + MetadataRecord TSCWrap; + TSCWrap.Type = uint8_t(RecordType::Metadata); + TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); + + // The data for the TSCWrap record contains the following bytes: + // - Full TSC (uint64_t, 8 bytes) + // Total = 8 bytes. 
+ std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); + std::memcpy(MemPtr, &TSCWrap, sizeof(MetadataRecord)); + MemPtr += sizeof(MetadataRecord); + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; +} + +inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { + writeTSCWrapMetadata(TSC, RecordPtr); +} + +inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, + XRayEntryType EntryType, + char *&MemPtr) XRAY_NEVER_INSTRUMENT { + std::aligned_storage<sizeof(FunctionRecord), alignof(FunctionRecord)>::type + AlignedFuncRecordBuffer; + auto &FuncRecord = + *reinterpret_cast<FunctionRecord *>(&AlignedFuncRecordBuffer); + FuncRecord.Type = uint8_t(RecordType::Function); + // Only take 28 bits of the function id. + FuncRecord.FuncId = FuncId & ~(0x0F << 28); + FuncRecord.TSCDelta = TSCDelta; + + switch (EntryType) { + case XRayEntryType::ENTRY: + ++NumConsecutiveFnEnters; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); + break; + case XRayEntryType::LOG_ARGS_ENTRY: + // We should not rewind functions with logged args. + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); + break; + case XRayEntryType::EXIT: + // If we've decided to log the function exit, we will never erase the log + // before it. + NumConsecutiveFnEnters = 0; + NumTailCalls = 0; + FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); + break; + case XRayEntryType::TAIL: + // If we just entered the function we're tail exiting from or erased every + // invocation since then, this function entry tail pair is a candidate to + // be erased when the child function exits. + if (NumConsecutiveFnEnters > 0) { + ++NumTailCalls; + NumConsecutiveFnEnters = 0; + } else { + // We will never be able to erase this tail call since we have logged + // something in between the function entry and tail exit. 
+ NumTailCalls = 0; + NumConsecutiveFnEnters = 0; + } + FuncRecord.RecordKind = + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); + break; + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d\n", + FuncId); + return true; + }(); + (void)Once; + return; + } + } + + std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord)); + MemPtr += sizeof(FunctionRecord); +} + +static uint64_t thresholdTicks() { + static uint64_t TicksPerSec = probeRequiredCPUFeatures() + ? getTSCFrequency() + : __xray::NanosecondsPerSecond; + static const uint64_t ThresholdTicks = + TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000; + return ThresholdTicks; +} + +// Re-point the thread local pointer into this thread's Buffer before the recent +// "Function Entry" record and any "Tail Call Exit" records after that. +static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, + uint64_t &LastFunctionEntryTSC, int32_t FuncId) { + using AlignedFuncStorage = + std::aligned_storage<sizeof(FunctionRecord), + alignof(FunctionRecord)>::type; + RecordPtr -= FunctionRecSize; + AlignedFuncStorage AlignedFuncRecordBuffer; + const auto &FuncRecord = *reinterpret_cast<FunctionRecord *>( + std::memcpy(&AlignedFuncRecordBuffer, RecordPtr, FunctionRecSize)); + assert(FuncRecord.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry recording when rewinding."); + assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && + "Expected matching function id when rewinding Exit"); + --NumConsecutiveFnEnters; + LastTSC -= FuncRecord.TSCDelta; + + // We unwound one call. Update the state and return without writing a log. 
+ if (NumConsecutiveFnEnters != 0) { + LastFunctionEntryTSC -= FuncRecord.TSCDelta; + return; + } + + // Otherwise we've rewound the stack of all function entries, we might be + // able to rewind further by erasing tail call functions that are being + // exited from via this exit. + LastFunctionEntryTSC = 0; + auto RewindingTSC = LastTSC; + auto RewindingRecordPtr = RecordPtr - FunctionRecSize; + while (NumTailCalls > 0) { + AlignedFuncStorage TailExitRecordBuffer; + // Rewind the TSC back over the TAIL EXIT record. + const auto &ExpectedTailExit = + *reinterpret_cast<FunctionRecord *>(std::memcpy( + &TailExitRecordBuffer, RewindingRecordPtr, FunctionRecSize)); + + assert(ExpectedTailExit.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && + "Expected to find tail exit when rewinding."); + RewindingRecordPtr -= FunctionRecSize; + RewindingTSC -= ExpectedTailExit.TSCDelta; + AlignedFuncStorage FunctionEntryBuffer; + const auto &ExpectedFunctionEntry = *reinterpret_cast<FunctionRecord *>( + std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); + assert(ExpectedFunctionEntry.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry when rewinding tail call."); + assert(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId && + "Expected funcids to match when rewinding tail call."); + + // This tail call exceeded the threshold duration. It will not be erased. + if ((TSC - RewindingTSC) >= thresholdTicks()) { + NumTailCalls = 0; + return; + } + + // We can erase a tail exit pair that we're exiting through since + // its duration is under threshold. 
+ --NumTailCalls; + RewindingRecordPtr -= FunctionRecSize; + RewindingTSC -= ExpectedFunctionEntry.TSCDelta; + RecordPtr -= 2 * FunctionRecSize; + LastTSC = RewindingTSC; + } +} + +inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { + auto EC = BQ->releaseBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, + BufferQueue::getErrorString(EC)); + return false; + } + return true; +} + +inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, + struct timespec *), + size_t MaxSize) XRAY_NEVER_INSTRUMENT { + char *BufferStart = static_cast<char *>(Buffer.Buffer); + if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; + auto EC = LocalBQ->getBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + Report("Failed to acquire a buffer; error=%s\n", + BufferQueue::getErrorString(EC)); + return false; + } + setupNewBuffer(wall_clock_reader); + } + return true; +} + +inline bool isLogInitializedAndReady( + std::shared_ptr<BufferQueue> &LocalBQ, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, + struct timespec *)) XRAY_NEVER_INSTRUMENT { + // Bail out right away if logging is not initialized yet. + // We should take the opportunity to release the buffer though. 
+ auto Status = __sanitizer::atomic_load(&LoggingStatus, + __sanitizer::memory_order_acquire); + if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { + if (RecordPtr != nullptr && + (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; + RecordPtr = nullptr; + LocalBQ = nullptr; + return false; + } + return false; + } + + if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return false; + RecordPtr = nullptr; + } + + if (Buffer.Buffer == nullptr) { + auto EC = LocalBQ->getBuffer(Buffer); + if (EC != BufferQueue::ErrorCode::Ok) { + auto LS = __sanitizer::atomic_load(&LoggingStatus, + __sanitizer::memory_order_acquire); + if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING && + LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) + Report("Failed to acquire a buffer; error=%s\n", + BufferQueue::getErrorString(EC)); + return false; + } + + setupNewBuffer(wall_clock_reader); + } + + if (CurrentCPU == std::numeric_limits<uint16_t>::max()) { + // This means this is the first CPU this thread has ever run on. We set + // the current CPU and record this as the first TSC we've seen. 
+ CurrentCPU = CPU; + writeNewCPUIdMetadata(CPU, TSC); + } + + return true; +} // namespace __xray_fdr_internal + +inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { + auto BufferStart = static_cast<char *>(Buffer.Buffer); + if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { + writeEOBMetadata(); + if (!releaseThreadLocalBuffer(LocalBQ.get())) + return; + RecordPtr = nullptr; + } +} + +inline void processFunctionHook( + int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), + __sanitizer::atomic_sint32_t &LoggingStatus, + const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT { + // Prevent signal handler recursion, so in case we're already in a log writing + // mode and the signal handler comes in (and is also instrumented) then we + // don't want to be clobbering potentially partial writes already happening in + // the thread. We use a simple thread_local latch to only allow one on-going + // handleArg0 to happen at any given time. + thread_local bool Running = false; + RecursionGuard Guard{Running}; + if (!Guard) { + assert(Running == true && "RecursionGuard is buggy!"); + return; + } + + // In case the reference has been cleaned up before, we make sure we + // initialize it to the provided BufferQueue. + if (LocalBQ == nullptr) + LocalBQ = BQ; + + if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader)) + return; + + // Before we go setting up writing new function entries, we need to be really + // careful about the pointer math we're doing. This means we need to ensure + // that the record we are about to write is going to fit into the buffer, + // without overflowing the buffer. + // + // To do this properly, we use the following assumptions: + // + // - The least number of bytes we will ever write is 8 + // (sizeof(FunctionRecord)) only if the delta between the previous entry + // and this entry is within 32 bits. 
+ // - The most number of bytes we will ever write is 8 + 16 = 24. This is + // computed by: + // + // sizeof(FunctionRecord) + sizeof(MetadataRecord) + // + // These arise in the following cases: + // + // 1. When the delta between the TSC we get and the previous TSC for the + // same CPU is outside of the uint32_t range, we end up having to + // write a MetadataRecord to indicate a "tsc wrap" before the actual + // FunctionRecord. + // 2. When we learn that we've moved CPUs, we need to write a + // MetadataRecord to indicate a "cpu change", and thus write out the + // current TSC for that CPU before writing out the actual + // FunctionRecord. + // 3. When we learn about a new CPU ID, we need to write down a "new cpu + // id" MetadataRecord before writing out the actual FunctionRecord. + // + // - An End-of-Buffer (EOB) MetadataRecord is 16 bytes. + // + // So the math we need to do is to determine whether writing 24 bytes past the + // current pointer leaves us with enough bytes to write the EOB + // MetadataRecord. If we don't have enough space after writing as much as 24 + // bytes in the end of the buffer, we need to write out the EOB, get a new + // Buffer, set it up properly before doing any further writing. + // + if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { + LocalBQ = nullptr; + return; + } + + // By this point, we are now ready to write at most 24 bytes (one metadata + // record and one function record). + assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - + static_cast<char *>(Buffer.Buffer) >= + static_cast<ptrdiff_t>(MetadataRecSize) && + "Misconfigured BufferQueue provided; Buffer size not large enough."); + + // Here we compute the TSC Delta. There are a few interesting situations we + // need to account for: + // + // - The thread has migrated to a different CPU. If this is the case, then + // we write down the following records: + // + // 1. A 'NewCPUId' Metadata record. + // 2. 
A FunctionRecord with a 0 for the TSCDelta field. + // + // - The TSC delta is greater than the 32 bits we can store in a + // FunctionRecord. In this case we write down the following records: + // + // 1. A 'TSCWrap' Metadata record. + // 2. A FunctionRecord with a 0 for the TSCDelta field. + // + // - The TSC delta is representable within the 32 bits we can store in a + // FunctionRecord. In this case we write down just a FunctionRecord with + // the correct TSC delta. + // + uint32_t RecordTSCDelta = 0; + if (CPU != CurrentCPU) { + // We've moved to a new CPU. + writeNewCPUIdMetadata(CPU, TSC); + } else { + // If the delta is greater than the range for a uint32_t, then we write out + // the TSC wrap metadata entry with the full TSC, and the TSC for the + // function record be 0. + auto Delta = TSC - LastTSC; + if (Delta > (1ULL << 32) - 1) + writeTSCWrapMetadata(TSC); + else + RecordTSCDelta = Delta; + } + + LastTSC = TSC; + CurrentCPU = CPU; + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + // Update the thread local state for the next invocation. + LastFunctionEntryTSC = TSC; + break; + case XRayEntryType::TAIL: + break; + case XRayEntryType::EXIT: + // Break out and write the exit record if we can't erase any functions. + if (NumConsecutiveFnEnters == 0 || + (TSC - LastFunctionEntryTSC) >= thresholdTicks()) + break; + rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId); + return; // without writing log. + case XRayEntryType::CUSTOM_EVENT: { + // This is a bug in patching, so we'll report it once and move on. + static bool Once = [&] { + Report("Internal error: patched an XRay custom event call as a function; " + "func id = %d", + FuncId); + return true; + }(); + (void)Once; + return; + } + } + + writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr); + + // If we've exhausted the buffer by this time, we then release the buffer to + // make sure that other threads may start using this buffer. 
+ endBufferIfFull(); +} + +} // namespace __xray_fdr_internal +} // namespace __xray + +#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H diff --git a/contrib/compiler-rt/lib/xray/xray_flags.cc b/contrib/compiler-rt/lib/xray/xray_flags.cc index 338c237..1ee4d10 100644 --- a/contrib/compiler-rt/lib/xray/xray_flags.cc +++ b/contrib/compiler-rt/lib/xray/xray_flags.cc @@ -24,31 +24,55 @@ namespace __xray { Flags xray_flags_dont_use_directly; // use via flags(). -void Flags::SetDefaults() XRAY_NEVER_INSTRUMENT { +void Flags::setDefaults() XRAY_NEVER_INSTRUMENT { #define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; #include "xray_flags.inc" #undef XRAY_FLAG } -static void RegisterXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { +static void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { #define XRAY_FLAG(Type, Name, DefaultValue, Description) \ RegisterFlag(P, #Name, Description, &F->Name); #include "xray_flags.inc" #undef XRAY_FLAG } -void InitializeFlags() XRAY_NEVER_INSTRUMENT { +// This function, as defined with the help of a macro meant to be introduced at +// build time of the XRay runtime, passes in a statically defined list of +// options that control XRay. This means users/deployments can tweak the +// defaults that override the hard-coded defaults in the xray_flags.inc at +// compile-time using the XRAY_DEFAULT_OPTIONS macro. +static const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_DEFAULT_OPTIONS +// Do the double-layered string conversion to prevent badly crafted strings +// provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues (or +// changing the semantics of the implementation through the macro). This ensures +// that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a string literal. 
+#define XRAY_STRINGIZE(x) #x +#define XRAY_STRINGIZE_OPTIONS(options) XRAY_STRINGIZE(options) + return XRAY_STRINGIZE_OPTIONS(XRAY_DEFAULT_OPTIONS); +#else + return ""; +#endif +} + +void initializeFlags() XRAY_NEVER_INSTRUMENT { SetCommonFlagsDefaults(); auto *F = flags(); - F->SetDefaults(); + F->setDefaults(); FlagParser XRayParser; - RegisterXRayFlags(&XRayParser, F); + registerXRayFlags(&XRayParser, F); RegisterCommonFlags(&XRayParser); - // Override from command line. + // Use options defaulted at compile-time for the runtime. + const char *XRayCompileFlags = useCompilerDefinedFlags(); + XRayParser.ParseString(XRayCompileFlags); + + // Override from environment variables. XRayParser.ParseString(GetEnv("XRAY_OPTIONS")); + // Override from command line. InitializeCommonFlags(); if (Verbosity()) diff --git a/contrib/compiler-rt/lib/xray/xray_flags.h b/contrib/compiler-rt/lib/xray/xray_flags.h index 2ecf5fb..f4e3028 100644 --- a/contrib/compiler-rt/lib/xray/xray_flags.h +++ b/contrib/compiler-rt/lib/xray/xray_flags.h @@ -24,13 +24,13 @@ struct Flags { #include "xray_flags.inc" #undef XRAY_FLAG - void SetDefaults(); + void setDefaults(); }; extern Flags xray_flags_dont_use_directly; inline Flags *flags() { return &xray_flags_dont_use_directly; } -void InitializeFlags(); +void initializeFlags(); } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_flags.inc b/contrib/compiler-rt/lib/xray/xray_flags.inc index 0f6ced8..7ddce78 100644 --- a/contrib/compiler-rt/lib/xray/xray_flags.inc +++ b/contrib/compiler-rt/lib/xray/xray_flags.inc @@ -14,9 +14,14 @@ #error "Define XRAY_FLAG prior to including this file!" 
#endif -XRAY_FLAG(bool, patch_premain, true, +XRAY_FLAG(bool, patch_premain, false, "Whether to patch instrumentation points before main.") XRAY_FLAG(bool, xray_naive_log, true, "Whether to install the naive log implementation.") XRAY_FLAG(const char *, xray_logfile_base, "xray-log.", "Filename base for the xray logfile.") +XRAY_FLAG(bool, xray_fdr_log, false, + "Whether to install the flight data recorder logging implementation.") +XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5, + "FDR logging will try to skip functions that execute for fewer " + "microseconds than this threshold.") diff --git a/contrib/compiler-rt/lib/xray/xray_init.cc b/contrib/compiler-rt/lib/xray/xray_init.cc index eb86182..aa660ba 100644 --- a/contrib/compiler-rt/lib/xray/xray_init.cc +++ b/contrib/compiler-rt/lib/xray/xray_init.cc @@ -12,7 +12,6 @@ // XRay initialisation logic. //===----------------------------------------------------------------------===// -#include <atomic> #include <fcntl.h> #include <strings.h> #include <unistd.h> @@ -26,9 +25,10 @@ extern "C" { void __xray_init(); extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)); extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)); +extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); } -using namespace __sanitizer; using namespace __xray; // When set to 'true' this means the XRay runtime has been initialised. We use @@ -38,29 +38,30 @@ using namespace __xray; // // FIXME: Support DSO instrumentation maps too. The current solution only works // for statically linked executables. -std::atomic<bool> XRayInitialized{false}; +__sanitizer::atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. 
-std::atomic<__xray::XRaySledMap> XRayInstrMap{}; +__sanitizer::SpinMutex XRayInstrMapMutex; +XRaySledMap XRayInstrMap; // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. void __xray_init() XRAY_NEVER_INSTRUMENT { - InitializeFlags(); + initializeFlags(); if (__start_xray_instr_map == nullptr) { Report("XRay instrumentation map missing. Not initializing XRay.\n"); return; } - // Now initialize the XRayInstrMap global struct with the address of the - // entries, reinterpreted as an array of XRaySledEntry objects. We use the - // virtual pointer we have from the section to provide us the correct - // information. - __xray::XRaySledMap SledMap{}; - SledMap.Sleds = __start_xray_instr_map; - SledMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; - XRayInstrMap.store(SledMap, std::memory_order_release); - XRayInitialized.store(true, std::memory_order_release); + { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + XRayInstrMap.Sleds = __start_xray_instr_map; + XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; + XRayInstrMap.SledsIndex = __start_xray_fn_idx; + XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; + } + __sanitizer::atomic_store(&XRayInitialized, true, + __sanitizer::memory_order_release); if (flags()->patch_premain) __xray_patch(); diff --git a/contrib/compiler-rt/lib/xray/xray_inmemory_log.cc b/contrib/compiler-rt/lib/xray/xray_inmemory_log.cc index adcb216..83aecfa 100644 --- a/contrib/compiler-rt/lib/xray/xray_inmemory_log.cc +++ b/contrib/compiler-rt/lib/xray/xray_inmemory_log.cc @@ -16,8 +16,6 @@ //===----------------------------------------------------------------------===// #include <cassert> -#include <cstdint> -#include <cstdio> #include <fcntl.h> #include <mutex> #include <sys/stat.h> @@ -26,19 +24,13 @@ #include <thread> #include <unistd.h> -#if defined(__x86_64__) -#include "xray_x86_64.h" -#elif 
defined(__arm__) || defined(__aarch64__) -#include "xray_emulate_tsc.h" -#else -#error "Unsupported CPU Architecture" -#endif /* Architecture-specific inline intrinsics */ - #include "sanitizer_common/sanitizer_libc.h" #include "xray/xray_records.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" +#include "xray_tsc.h" +#include "xray_utils.h" // __xray_InMemoryRawLog will use a thread-local aligned buffer capped to a // certain size (32kb by default) and use it as if it were a circular buffer for @@ -53,25 +45,6 @@ namespace __xray { std::mutex LogMutex; -static void retryingWriteAll(int Fd, char *Begin, - char *End) XRAY_NEVER_INSTRUMENT { - if (Begin == End) - return; - auto TotalBytes = std::distance(Begin, End); - while (auto Written = write(Fd, Begin, TotalBytes)) { - if (Written < 0) { - if (errno == EINTR) - continue; // Try again. - Report("Failed to write; errno = %d\n", errno); - return; - } - TotalBytes -= Written; - if (TotalBytes == 0) - break; - Begin += Written; - } -} - class ThreadExitFlusher { int Fd; XRayRecord *Start; @@ -102,41 +75,15 @@ public: using namespace __xray; -void PrintToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT { - fprintf(stderr, "%s", Buffer); -} - static int __xray_OpenLogFile() XRAY_NEVER_INSTRUMENT { - // FIXME: Figure out how to make this less stderr-dependent. - SetPrintfAndReportCallback(PrintToStdErr); - // Open a temporary file once for the log. - static char TmpFilename[256] = {}; - static char TmpWildcardPattern[] = "XXXXXX"; - auto Argv = GetArgv(); - const char *Progname = Argv[0] == nullptr ? 
"(unknown)" : Argv[0]; - const char *LastSlash = internal_strrchr(Progname, '/'); - - if (LastSlash != nullptr) - Progname = LastSlash + 1; - - const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern); - int NeededLength = internal_snprintf(TmpFilename, sizeof(TmpFilename), - "%.*s%.*s.%s", - HalfLength, flags()->xray_logfile_base, - HalfLength, Progname, - TmpWildcardPattern); - if (NeededLength > int(sizeof(TmpFilename))) { - Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename); - return -1; - } - int Fd = mkstemp(TmpFilename); - if (Fd == -1) { - Report("XRay: Failed opening temporary file '%s'; not logging events.\n", - TmpFilename); + int F = getLogFD(); + if (F == -1) return -1; - } - if (Verbosity()) - fprintf(stderr, "XRay: Log file in '%s'\n", TmpFilename); + + // Test for required CPU features and cache the cycle frequency + static bool TSCSupported = probeRequiredCPUFeatures(); + static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency() + : __xray::NanosecondsPerSecond; // Since we're here, we get to write the header. We set it up so that the // header will only be written once, at the start, and let the threads @@ -144,19 +91,20 @@ static int __xray_OpenLogFile() XRAY_NEVER_INSTRUMENT { XRayFileHeader Header; Header.Version = 1; Header.Type = FileTypes::NAIVE_LOG; - Header.CycleFrequency = __xray::cycleFrequency(); + Header.CycleFrequency = CycleFrequency; // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' // before setting the values in the header. 
Header.ConstantTSC = 1; Header.NonstopTSC = 1; - retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), + retryingWriteAll(F, reinterpret_cast<char *>(&Header), reinterpret_cast<char *>(&Header) + sizeof(Header)); - return Fd; + return F; } -void __xray_InMemoryRawLog(int32_t FuncId, - XRayEntryType Type) XRAY_NEVER_INSTRUMENT { +template <class RDTSC> +void __xray_InMemoryRawLog(int32_t FuncId, XRayEntryType Type, + RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { using Buffer = std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type; static constexpr size_t BuffLen = 1024; @@ -173,7 +121,7 @@ void __xray_InMemoryRawLog(int32_t FuncId, // through a pointer offset. auto &R = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer)[Offset]; R.RecordType = RecordTypes::NORMAL; - R.TSC = __xray::readTSC(R.CPU); + R.TSC = ReadTSC(R.CPU); R.TId = TId; R.Type = Type; R.FuncId = FuncId; @@ -187,8 +135,32 @@ void __xray_InMemoryRawLog(int32_t FuncId, } } -static auto Unused = [] { +void __xray_InMemoryRawLogRealTSC(int32_t FuncId, + XRayEntryType Type) XRAY_NEVER_INSTRUMENT { + __xray_InMemoryRawLog(FuncId, Type, __xray::readTSC); +} + +void __xray_InMemoryEmulateTSC(int32_t FuncId, + XRayEntryType Type) XRAY_NEVER_INSTRUMENT { + __xray_InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + timespec TS; + int result = clock_gettime(CLOCK_REALTIME, &TS); + if (result != 0) { + Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno)); + TS = {0, 0}; + } + CPU = 0; + return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + }); +} + +static auto UNUSED Unused = [] { + auto UseRealTSC = probeRequiredCPUFeatures(); + if (!UseRealTSC) + Report("WARNING: Required CPU features missing for XRay instrumentation, " + "using emulation instead.\n"); if (flags()->xray_naive_log) - __xray_set_handler(__xray_InMemoryRawLog); + __xray_set_handler(UseRealTSC ? 
__xray_InMemoryRawLogRealTSC + : __xray_InMemoryEmulateTSC); return true; }(); diff --git a/contrib/compiler-rt/lib/xray/xray_interface.cc b/contrib/compiler-rt/lib/xray/xray_interface.cc index 20a2b66..694d34c 100644 --- a/contrib/compiler-rt/lib/xray/xray_interface.cc +++ b/contrib/compiler-rt/lib/xray/xray_interface.cc @@ -15,7 +15,6 @@ #include "xray_interface_internal.h" -#include <atomic> #include <cstdint> #include <cstdio> #include <errno.h> @@ -35,12 +34,24 @@ static const int16_t cSledLength = 12; static const int16_t cSledLength = 32; #elif defined(__arm__) static const int16_t cSledLength = 28; +#elif SANITIZER_MIPS32 +static const int16_t cSledLength = 48; +#elif SANITIZER_MIPS64 +static const int16_t cSledLength = 64; +#elif defined(__powerpc64__) +static const int16_t cSledLength = 8; #else #error "Unsupported CPU Architecture" #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction{nullptr}; +__sanitizer::atomic_uintptr_t XRayPatchedFunction{0}; + +// This is the function to call from the arg1-enabled sleds/trampolines. +__sanitizer::atomic_uintptr_t XRayArgLogger{0}; + +// This is the function to call when we encounter a custom event log call. +__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0}; // MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo // any successful mprotect(...) changes. 
This is used to make a page writeable @@ -79,23 +90,45 @@ public: } // namespace __xray -extern std::atomic<bool> XRayInitialized; -extern std::atomic<__xray::XRaySledMap> XRayInstrMap; +extern __sanitizer::SpinMutex XRayInstrMapMutex; +extern __sanitizer::atomic_uint8_t XRayInitialized; +extern __xray::XRaySledMap XRayInstrMap; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (XRayInitialized.load(std::memory_order_acquire)) { - __xray::XRayPatchedFunction.store(entry, std::memory_order_release); + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + + __sanitizer::atomic_store(&__xray::XRayPatchedFunction, + reinterpret_cast<uintptr_t>(entry), + __sanitizer::memory_order_release); + return 1; + } + return 0; +} + +int __xray_set_customevent_handler(void (*entry)(void *, size_t)) + XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, + reinterpret_cast<uintptr_t>(entry), + __sanitizer::memory_order_release); return 1; } return 0; } + int __xray_remove_handler() XRAY_NEVER_INSTRUMENT { return __xray_set_handler(nullptr); } -std::atomic<bool> XRayPatching{false}; +int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT { + return __xray_set_customevent_handler(nullptr); +} + +__sanitizer::atomic_uint8_t XRayPatching{0}; using namespace __xray; @@ -115,34 +148,76 @@ public: }; template <class Function> -CleanupInvoker<Function> ScopeCleanup(Function Fn) XRAY_NEVER_INSTRUMENT { +CleanupInvoker<Function> scopeCleanup(Function Fn) XRAY_NEVER_INSTRUMENT { return CleanupInvoker<Function>{Fn}; } -// ControlPatching implements the common internals of the patching/unpatching +inline bool patchSled(const XRaySledEntry &Sled, bool Enable, + int32_t FuncId) XRAY_NEVER_INSTRUMENT { + // While we're here, we should patch the nop sled. 
To do that we mprotect + // the page containing the function to be writeable. + const uint64_t PageSize = GetPageSizeCached(); + void *PageAlignedAddr = + reinterpret_cast<void *>(Sled.Address & ~(PageSize - 1)); + std::size_t MProtectLen = (Sled.Address + cSledLength) - + reinterpret_cast<uint64_t>(PageAlignedAddr); + MProtectHelper Protector(PageAlignedAddr, MProtectLen); + if (Protector.MakeWriteable() == -1) { + printf("Failed mprotect: %d\n", errno); + return XRayPatchingStatus::FAILED; + } + + bool Success = false; + switch (Sled.Kind) { + case XRayEntryType::ENTRY: + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); + break; + case XRayEntryType::EXIT: + Success = patchFunctionExit(Enable, FuncId, Sled); + break; + case XRayEntryType::TAIL: + Success = patchFunctionTailExit(Enable, FuncId, Sled); + break; + case XRayEntryType::LOG_ARGS_ENTRY: + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); + break; + case XRayEntryType::CUSTOM_EVENT: + Success = patchCustomEvent(Enable, FuncId, Sled); + break; + default: + Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind)); + return false; + } + return Success; +} + +// controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -XRayPatchingStatus ControlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!XRayInitialized.load(std::memory_order_acquire)) +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. 
- static bool NotPatching = false; - if (!XRayPatching.compare_exchange_strong(NotPatching, true, - std::memory_order_acq_rel, - std::memory_order_acquire)) { + uint8_t NotPatching = false; + if (!__sanitizer::atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) return XRayPatchingStatus::ONGOING; // Already patching. - } - bool PatchingSuccess = false; - auto XRayPatchingStatusResetter = ScopeCleanup([&PatchingSuccess] { - if (!PatchingSuccess) { - XRayPatching.store(false, std::memory_order_release); - } + uint8_t PatchingSuccess = false; + auto XRayPatchingStatusResetter = scopeCleanup([&PatchingSuccess] { + if (!PatchingSuccess) + __sanitizer::atomic_store(&XRayPatching, false, + __sanitizer::memory_order_release); }); // Step 1: Compute the function id, as a unique identifier per function in the // instrumentation map. - XRaySledMap InstrMap = XRayInstrMap.load(std::memory_order_acquire); + XRaySledMap InstrMap; + { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + InstrMap = XRayInstrMap; + } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; @@ -163,45 +238,110 @@ XRayPatchingStatus ControlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { ++FuncId; CurFun = F; } - - // While we're here, we should patch the nop sled. To do that we mprotect - // the page containing the function to be writeable. 
- void *PageAlignedAddr = - reinterpret_cast<void *>(Sled.Address & ~(PageSize - 1)); - std::size_t MProtectLen = (Sled.Address + cSledLength) - - reinterpret_cast<uint64_t>(PageAlignedAddr); - MProtectHelper Protector(PageAlignedAddr, MProtectLen); - if (Protector.MakeWriteable() == -1) { - printf("Failed mprotect: %d\n", errno); - return XRayPatchingStatus::FAILED; - } - - bool Success = false; - switch (Sled.Kind) { - case XRayEntryType::ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled); - break; - case XRayEntryType::EXIT: - Success = patchFunctionExit(Enable, FuncId, Sled); - break; - case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled); - break; - default: - Report("Unsupported sled kind: %d\n", int(Sled.Kind)); - continue; - } - (void)Success; + patchSled(Sled, Enable, FuncId); } - XRayPatching.store(false, std::memory_order_release); + __sanitizer::atomic_store(&XRayPatching, false, + __sanitizer::memory_order_release); PatchingSuccess = true; return XRayPatchingStatus::SUCCESS; } XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { - return ControlPatching(true); + return controlPatching(true); } XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { - return ControlPatching(false); + return controlPatching(false); +} + +XRayPatchingStatus patchFunction(int32_t FuncId, + bool Enable) XRAY_NEVER_INSTRUMENT { + if (!__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!__sanitizer::atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. + + // Next, we look for the function index. + XRaySledMap InstrMap; + { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + InstrMap = XRayInstrMap; + } + + // If we don't have an index, we can't patch individual functions. 
+ if (InstrMap.Functions == 0) + return XRayPatchingStatus::NOT_INITIALIZED; + + // FuncId must be a positive number, less than the number of functions + // instrumented. + if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) { + Report("Invalid function id provided: %d\n", FuncId); + return XRayPatchingStatus::FAILED; + } + + // Now we patch ths sleds for this specific function. + auto SledRange = InstrMap.SledsIndex[FuncId - 1]; + auto *f = SledRange.Begin; + auto *e = SledRange.End; + + bool SucceedOnce = false; + while (f != e) + SucceedOnce |= patchSled(*f++, Enable, FuncId); + + __sanitizer::atomic_store(&XRayPatching, false, + __sanitizer::memory_order_release); + + if (!SucceedOnce) { + Report("Failed patching any sled for function '%d'.", FuncId); + return XRayPatchingStatus::FAILED; + } + + return XRayPatchingStatus::SUCCESS; +} + +XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { + return patchFunction(FuncId, true); +} + +XRayPatchingStatus +__xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { + return patchFunction(FuncId, false); +} + +int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { + if (!__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) + return 0; + + // A relaxed write might not be visible even if the current thread gets + // scheduled on a different CPU/NUMA node. We need to wait for everyone to + // have this handler installed for consistency of collected data across CPUs. 
+ __sanitizer::atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry), + __sanitizer::memory_order_release); + return 1; +} + +int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } + +uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions) + return 0; + return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Address +// On PPC, function entries are always aligned to 16 bytes. The beginning of a +// sled might be a local entry, which is always +8 based on the global entry. +// Always return the global entry. +#ifdef __PPC__ + & ~0xf +#endif + ; +} + +size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + return XRayInstrMap.Functions; } diff --git a/contrib/compiler-rt/lib/xray/xray_interface_internal.h b/contrib/compiler-rt/lib/xray/xray_interface_internal.h index a8434a6..4a27846 100644 --- a/contrib/compiler-rt/lib/xray/xray_interface_internal.h +++ b/contrib/compiler-rt/lib/xray/xray_interface_internal.h @@ -39,6 +39,11 @@ struct XRaySledEntry { #error "Unsupported word size." 
#endif }; + +struct XRayFunctionSledIndex { + const XRaySledEntry* Begin; + const XRaySledEntry* End; +}; } namespace __xray { @@ -46,15 +51,16 @@ namespace __xray { struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; + const XRayFunctionSledIndex *SledsIndex; + size_t Functions; }; -uint64_t cycleFrequency(); - bool patchFunctionEntry(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled); + const XRaySledEntry &Sled, void (*Trampoline)()); bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray @@ -64,6 +70,8 @@ extern "C" { extern void __xray_FunctionEntry(); extern void __xray_FunctionExit(); extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); } #endif diff --git a/contrib/compiler-rt/lib/xray/xray_log_interface.cc b/contrib/compiler-rt/lib/xray/xray_log_interface.cc new file mode 100644 index 0000000..ee14ae4 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_log_interface.cc @@ -0,0 +1,69 @@ +//===-- xray_log_interface.cc ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// +#include "xray/xray_log_interface.h" + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include "xray/xray_interface.h" +#include "xray_defs.h" + +#include <memory> + +__sanitizer::SpinMutex XRayImplMutex; +std::unique_ptr<XRayLogImpl> GlobalXRayImpl; + +void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { + if (Impl.log_init == nullptr || Impl.log_finalize == nullptr || + Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + GlobalXRayImpl.reset(); + __xray_remove_handler(); + __xray_remove_handler_arg1(); + return; + } + + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + GlobalXRayImpl.reset(new XRayLogImpl); + *GlobalXRayImpl = Impl; + __xray_set_handler(Impl.handle_arg0); +} + +void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + GlobalXRayImpl.reset(); + __xray_remove_handler(); + __xray_remove_handler_arg1(); +} + +XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers, + void *Args, + size_t ArgsSize) XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + if (!GlobalXRayImpl) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize); +} + +XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + if (!GlobalXRayImpl) + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + return GlobalXRayImpl->log_finalize(); +} + +XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + if (!GlobalXRayImpl) + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + return GlobalXRayImpl->flush_log(); +} diff --git a/contrib/compiler-rt/lib/xray/xray_mips.cc 
b/contrib/compiler-rt/lib/xray/xray_mips.cc new file mode 100644 index 0000000..cd86330 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_mips.cc @@ -0,0 +1,165 @@ +//===-- xray_mips.cc --------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of MIPS-specific routines (32-bit). +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum PatchOpcodes : uint32_t { + PO_ADDIU = 0x24000000, // addiu rt, rs, imm + PO_SW = 0xAC000000, // sw rt, offset(sp) + PO_LUI = 0x3C000000, // lui rs, %hi(address) + PO_ORI = 0x34000000, // ori rt, rs, %lo(address) + PO_JALR = 0x0000F809, // jalr rs + PO_LW = 0x8C000000, // lw rt, offset(address) + PO_B44 = 0x1000000b, // b #44 + PO_NOP = 0x0, // nop +}; + +enum RegNum : uint32_t { + RN_T0 = 0x8, + RN_T9 = 0x19, + RN_RA = 0x1F, + RN_SP = 0x1D, +}; + +inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs, + uint32_t Rt, + uint32_t Imm) XRAY_NEVER_INSTRUMENT { + return (Opcode | Rs << 21 | Rt << 16 | Imm); +} + +inline static uint32_t +encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd, + uint32_t Imm) XRAY_NEVER_INSTRUMENT { + return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode); +} + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { + // When |Enable| == true, + // We replace the 
following compile-time stub (sled): + // + // xray_sled_n: + // B .tmpN + // 11 NOPs (44 bytes) + // .tmpN + // ADDIU T9, T9, 44 + // + // With the following runtime patch: + // + // xray_sled_n (32-bit): + // addiu sp, sp, -8 ;create stack frame + // nop + // sw ra, 4(sp) ;save return address + // sw t9, 0(sp) ;save register t9 + // lui t9, %hi(__xray_FunctionEntry/Exit) + // ori t9, t9, %lo(__xray_FunctionEntry/Exit) + // lui t0, %hi(function_id) + // jalr t9 ;call Tracing hook + // ori t0, t0, %lo(function_id) ;pass function id (delay slot) + // lw t9, 0(sp) ;restore register t9 + // lw ra, 4(sp) ;restore return address + // addiu sp, sp, 8 ;delete stack frame + // + // We add 44 bytes to t9 because we want to adjust the function pointer to + // the actual start of function i.e. the address just after the noop sled. + // We do this because gp displacement relocation is emitted at the start of + // of the function i.e after the nop sled and to correctly calculate the + // global offset table address, t9 must hold the address of the instruction + // containing the gp displacement relocation. + // FIXME: Is this correct for the static relocation model? + // + // Replacement of the first 4-byte instruction should be the last and atomic + // operation, so that the user code which reaches the sled concurrently + // either jumps over the whole sled, or executes the whole sled when the + // latter is ready. 
+ // + // When |Enable|==false, we set back the first instruction in the sled to be + // B #44 + + if (Enable) { + uint32_t LoTracingHookAddr = + reinterpret_cast<int32_t>(TracingHook) & 0xffff; + uint32_t HiTracingHookAddr = + (reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff; + uint32_t LoFunctionID = FuncId & 0xffff; + uint32_t HiFunctionID = (FuncId >> 16) & 0xffff; + *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction( + PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_RA, 0x4); + *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction( + PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_T9, 0x0); + *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction( + PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HiTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 20) = encodeInstruction( + PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeInstruction( + PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID); + *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeSpecialInstruction( + PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0); + *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeInstruction( + PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID); + *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction( + PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_T9, 0x0); + *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction( + PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_RA, 0x4); + *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeInstruction( + PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x8); + uint32_t CreateStackSpaceInstr = encodeInstruction( + PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xFFF8); + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address), + uint32_t(CreateStackSpaceInstr), 
std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address), + uint32_t(PatchOpcodes::PO_B44), std::memory_order_release); + } + return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: In the future we'd need to distinguish between non-tail exits and + // tail exits for better information preservation. + return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips? + return false; +} + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/compiler-rt/lib/xray/xray_mips64.cc b/contrib/compiler-rt/lib/xray/xray_mips64.cc new file mode 100644 index 0000000..fa8fdd5 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_mips64.cc @@ -0,0 +1,173 @@ +//===-- xray_mips64.cc ------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of MIPS64-specific routines. 
+// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum PatchOpcodes : uint32_t { + PO_DADDIU = 0x64000000, // daddiu rt, rs, imm + PO_SD = 0xFC000000, // sd rt, base(offset) + PO_LUI = 0x3C000000, // lui rt, imm + PO_ORI = 0x34000000, // ori rt, rs, imm + PO_DSLL = 0x00000038, // dsll rd, rt, sa + PO_JALR = 0x00000009, // jalr rs + PO_LD = 0xDC000000, // ld rt, base(offset) + PO_B60 = 0x1000000f, // b #60 + PO_NOP = 0x0, // nop +}; + +enum RegNum : uint32_t { + RN_T0 = 0xC, + RN_T9 = 0x19, + RN_RA = 0x1F, + RN_SP = 0x1D, +}; + +inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs, + uint32_t Rt, + uint32_t Imm) XRAY_NEVER_INSTRUMENT { + return (Opcode | Rs << 21 | Rt << 16 | Imm); +} + +inline static uint32_t +encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd, + uint32_t Imm) XRAY_NEVER_INSTRUMENT { + return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode); +} + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { + // When |Enable| == true, + // We replace the following compile-time stub (sled): + // + // xray_sled_n: + // B .tmpN + // 15 NOPs (60 bytes) + // .tmpN + // + // With the following runtime patch: + // + // xray_sled_n (64-bit): + // daddiu sp, sp, -16 ;create stack frame + // nop + // sd ra, 8(sp) ;save return address + // sd t9, 0(sp) ;save register t9 + // lui t9, %highest(__xray_FunctionEntry/Exit) + // ori t9, t9, %higher(__xray_FunctionEntry/Exit) + // dsll t9, t9, 16 + // ori t9, t9, %hi(__xray_FunctionEntry/Exit) + // dsll t9, t9, 16 + // ori t9, t9, %lo(__xray_FunctionEntry/Exit) + // lui t0, %hi(function_id) + // jalr t9 ;call Tracing hook + 
// ori t0, t0, %lo(function_id) ;pass function id (delay slot) + // ld t9, 0(sp) ;restore register t9 + // ld ra, 8(sp) ;restore return address + // daddiu sp, sp, 16 ;delete stack frame + // + // Replacement of the first 4-byte instruction should be the last and atomic + // operation, so that the user code which reaches the sled concurrently + // either jumps over the whole sled, or executes the whole sled when the + // latter is ready. + // + // When |Enable|==false, we set back the first instruction in the sled to be + // B #60 + + if (Enable) { + uint32_t LoTracingHookAddr = + reinterpret_cast<int64_t>(TracingHook) & 0xffff; + uint32_t HiTracingHookAddr = + (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff; + uint32_t HigherTracingHookAddr = + (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff; + uint32_t HighestTracingHookAddr = + (reinterpret_cast<int64_t>(TracingHook) >> 48) & 0xffff; + uint32_t LoFunctionID = FuncId & 0xffff; + uint32_t HiFunctionID = (FuncId >> 16) & 0xffff; + *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction( + PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_RA, 0x8); + *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction( + PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_T9, 0x0); + *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction( + PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HighestTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 20) = + encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, + HigherTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeSpecialInstruction( + PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 0x10); + *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeInstruction( + PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, HiTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeSpecialInstruction( + PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 
0x10); + *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction( + PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr); + *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction( + PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID); + *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeSpecialInstruction( + PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0); + *reinterpret_cast<uint32_t *>(Sled.Address + 48) = encodeInstruction( + PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID); + *reinterpret_cast<uint32_t *>(Sled.Address + 52) = encodeInstruction( + PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_T9, 0x0); + *reinterpret_cast<uint32_t *>(Sled.Address + 56) = encodeInstruction( + PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_RA, 0x8); + *reinterpret_cast<uint32_t *>(Sled.Address + 60) = encodeInstruction( + PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x10); + uint32_t CreateStackSpace = encodeInstruction( + PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xfff0); + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address), + CreateStackSpace, std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address), + uint32_t(PatchOpcodes::PO_B60), std::memory_order_release); + } + return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: In the future we'd need to distinguish between non-tail exits and + 
// tail exits for better information preservation. + return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in mips64? + return false; +} +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/compiler-rt/lib/xray/xray_never_instrument.txt b/contrib/compiler-rt/lib/xray/xray_never_instrument.txt new file mode 100644 index 0000000..7fa48dd --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_never_instrument.txt @@ -0,0 +1,6 @@ +# List of function matchers common to C/C++ applications that make sense to +# never instrument. You can use this as an argument to +# -fxray-never-instrument=<path> along with your project-specific lists. + +# Never instrument any function whose symbol starts with __xray. +fun:__xray* diff --git a/contrib/compiler-rt/lib/xray/xray_powerpc64.cc b/contrib/compiler-rt/lib/xray/xray_powerpc64.cc new file mode 100644 index 0000000..ab03cb1 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_powerpc64.cc @@ -0,0 +1,106 @@ +//===-- xray_powerpc64.cc ---------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of powerpc64 and powerpc64le routines. 
+// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include "xray_utils.h" +#include <atomic> +#include <cassert> +#include <cstring> + +#ifndef __LITTLE_ENDIAN__ +#error powerpc64 big endian is not supported for now. +#endif + +namespace { + +constexpr unsigned long long JumpOverInstNum = 7; + +void clearCache(void *Addr, size_t Len) { + const size_t LineSize = 32; + + const intptr_t Mask = ~(LineSize - 1); + const intptr_t StartLine = ((intptr_t)Addr) & Mask; + const intptr_t EndLine = ((intptr_t)Addr + Len + LineSize - 1) & Mask; + + for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) + asm volatile("dcbf 0, %0" : : "r"(Line)); + asm volatile("sync"); + + for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) + asm volatile("icbi 0, %0" : : "r"(Line)); + asm volatile("isync"); +} + +} // namespace + +extern "C" void __clear_cache(void *start, void *end); + +namespace __xray { + +bool patchFunctionEntry(const bool Enable, uint32_t FuncId, + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + if (Enable) { + // lis 0, FuncId[16..31] + // ori 0, 0, FuncId[0..15] + *reinterpret_cast<uint64_t *>(Sled.Address) = + (0x3c000000ull + (FuncId >> 16)) + + ((0x60000000ull + (FuncId & 0xffff)) << 32); + } else { + // b +JumpOverInstNum instructions.
+ *reinterpret_cast<uint32_t *>(Sled.Address) = + 0x48000000ull + (JumpOverInstNum << 2); + } + clearCache(reinterpret_cast<void *>(Sled.Address), 8); + return true; +} + +bool patchFunctionExit(const bool Enable, uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + if (Enable) { + // lis 0, FuncId[16..31] + // ori 0, 0, FuncId[0..15] + *reinterpret_cast<uint64_t *>(Sled.Address) = + (0x3c000000ull + (FuncId >> 16)) + + ((0x60000000ull + (FuncId & 0xffff)) << 32); + } else { + // Copy the blr/b instruction after JumpOverInstNum instructions. + *reinterpret_cast<uint32_t *>(Sled.Address) = + *(reinterpret_cast<uint32_t *>(Sled.Address) + JumpOverInstNum); + } + clearCache(reinterpret_cast<void *>(Sled.Address), 8); + return true; +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + return patchFunctionExit(Enable, FuncId, Sled); +} + +// FIXME: Maybe implement this better? +bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // FIXME: Implement in powerpc64? + return false; +} + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { + // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/compiler-rt/lib/xray/xray_powerpc64.inc b/contrib/compiler-rt/lib/xray/xray_powerpc64.inc new file mode 100644 index 0000000..c1a1bac --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_powerpc64.inc @@ -0,0 +1,37 @@ +//===-- xray_powerpc64.inc --------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// + +#include <cstdint> +#include <mutex> +#include <sys/platform/ppc.h> + +#include "xray_defs.h" + +namespace __xray { + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + CPU = 0; + return __ppc_get_timebase(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + static std::mutex M; + std::lock_guard<std::mutex> Guard(M); + return __ppc_get_timebase_freq(); +} + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { + return true; +} + +} // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_AArch64.S index f1a471c..4d1b04f 100644 --- a/contrib/compiler-rt/lib/xray/xray_trampoline_AArch64.S +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_AArch64.S @@ -1,3 +1,5 @@ +#include "../builtins/assembly.h" + .text /* The variable containing the handler function pointer */ .global _ZN6__xray19XRayPatchedFunctionE @@ -87,3 +89,56 @@ FunctionExit_restore: LDP X3, X4, [SP], #16 LDP X1, X2, [SP], #16 RET + + /* Word-aligned function entry point */ + .p2align 2 + /* Let C/C++ see the symbol */ + .global __xray_FunctionTailExit + .type __xray_FunctionTailExit, %function + /* In C++ it is void extern "C" __xray_FunctionTailExit(uint32_t FuncId) + with FuncId passed in W0 register. */ +__xray_FunctionTailExit: + /* Move the return address beyond the end of sled data. The 12 bytes of + data are inserted in the code of the runtime patch, between the call + instruction and the instruction returned into. The data contains 32 + bits of instrumented function ID and 64 bits of the address of + the current trampoline. 
*/ + ADD X30, X30, #12 + /* Push the registers which may be modified by the handler function */ + STP X1, X2, [SP, #-16]! + STP X3, X4, [SP, #-16]! + STP X5, X6, [SP, #-16]! + STP X7, X30, [SP, #-16]! + /* Push the parameters of the tail called function */ + STP Q0, Q1, [SP, #-32]! + STP Q2, Q3, [SP, #-32]! + STP Q4, Q5, [SP, #-32]! + STP Q6, Q7, [SP, #-32]! + /* Load the address of _ZN6__xray19XRayPatchedFunctionE into X1 */ + LDR X1, =_ZN6__xray19XRayPatchedFunctionE + /* Load the handler function pointer into X2 */ + LDR X2, [X1] + /* Handler address is nullptr if handler is not set */ + CMP X2, #0 + BEQ FunctionTailExit_restore + /* Function ID is already in W0 (the first parameter). + X1=2 means that we are tracing a tail exit event, but before the + logging part of XRay is ready, we pretend that here a normal function + exit happens, so we give the handler code 1 */ + MOV X1, #1 + /* Call the handler with 2 parameters in W0 and X1 */ + BLR X2 +FunctionTailExit_restore: + /* Pop the parameters of the tail called function */ + LDP Q6, Q7, [SP], #32 + LDP Q4, Q5, [SP], #32 + LDP Q2, Q3, [SP], #32 + LDP Q0, Q1, [SP], #32 + /* Pop the registers which may be modified by the handler function */ + LDP X7, X30, [SP], #16 + LDP X5, X6, [SP], #16 + LDP X3, X4, [SP], #16 + LDP X1, X2, [SP], #16 + RET + +NO_EXEC_STACK_DIRECTIVE diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_arm.S b/contrib/compiler-rt/lib/xray/xray_trampoline_arm.S index 5d87c97..71dbee6 100644 --- a/contrib/compiler-rt/lib/xray/xray_trampoline_arm.S +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_arm.S @@ -1,8 +1,11 @@ +#include "../builtins/assembly.h" + .syntax unified .arch armv6t2 .fpu vfpv2 .code 32 .global _ZN6__xray19XRayPatchedFunctionE + @ Word-aligned function entry point .p2align 2 @ Let C/C++ see the symbol @@ -63,3 +66,37 @@ FunctionExit_restore: @ Restore the floating-point return value of the instrumented function VPOP {d0} POP {r1-r3,pc} + + @ Word-aligned function entry 
point + .p2align 2 + @ Let C/C++ see the symbol + .global __xray_FunctionTailExit + @ It preserves all registers except r0, r12(ip), r14(lr) and r15(pc) + @ Assume that "q" part of the floating-point registers is not used + @ for passing parameters to C/C++ functions. + .type __xray_FunctionTailExit, %function + @ In C++ it is void extern "C" __xray_FunctionTailExit(uint32_t FuncId) + @ with FuncId passed in r0 register. +__xray_FunctionTailExit: + PUSH {r1-r3,lr} + @ Save floating-point parameters of the instrumented function + VPUSH {d0-d7} + MOVW r1,#:lower16:_ZN6__xray19XRayPatchedFunctionE + MOVT r1,#:upper16:_ZN6__xray19XRayPatchedFunctionE + LDR r2, [r1] + @ Handler address is nullptr if handler is not set + CMP r2, #0 + BEQ FunctionTailExit_restore + @ Function ID is already in r0 (the first parameter). + @ r1=2 means that we are tracing a tail exit event + @ But before the logging part of XRay is ready, we pretend that here a + @ normal function exit happens, so we give the handler code 1 + MOV r1, #1 + @ Call the handler with 2 parameters in r0 and r1 + BLX r2 +FunctionTailExit_restore: + @ Restore floating-point parameters of the instrumented function + VPOP {d0-d7} + POP {r1-r3,pc} + +NO_EXEC_STACK_DIRECTIVE diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_mips.S b/contrib/compiler-rt/lib/xray/xray_trampoline_mips.S new file mode 100644 index 0000000..39a1a3a --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_mips.S @@ -0,0 +1,110 @@ +//===-- xray_trampoline_mips.s ----------------------------------*- ASM -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the MIPS-specific assembler for the trampolines. 
+// +//===----------------------------------------------------------------------===// + + .text + .file "xray_trampoline_mips.S" + .globl __xray_FunctionEntry + .p2align 2 + .type __xray_FunctionEntry,@function +__xray_FunctionEntry: + .cfi_startproc + .set noreorder + .cpload $t9 + .set reorder + // Save argument registers before doing any actual work + .cfi_def_cfa_offset 36 + addiu $sp, $sp, -36 + sw $ra, 32($sp) + .cfi_offset 31, -4 + sw $a3, 28($sp) + sw $a2, 24($sp) + sw $a1, 20($sp) + sw $a0, 16($sp) + sdc1 $f14, 8($sp) + sdc1 $f12, 0($sp) + + la $t9, _ZN6__xray19XRayPatchedFunctionE + lw $t9, 0($t9) + + beqz $t9, FunctionEntry_restore + + // a1=0 means that we are tracing an entry event + move $a1, $zero + // Function ID is in t0 (the first parameter). + move $a0, $t0 + jalr $t9 + +FunctionEntry_restore: + // Restore argument registers + ldc1 $f12, 0($sp) + ldc1 $f14, 8($sp) + lw $a0, 16($sp) + lw $a1, 20($sp) + lw $a2, 24($sp) + lw $a3, 28($sp) + lw $ra, 32($sp) + addiu $sp, $sp, 36 + jr $ra +FunctionEntry_end: + .size __xray_FunctionEntry, FunctionEntry_end-__xray_FunctionEntry + .cfi_endproc + + .text + .globl __xray_FunctionExit + .p2align 2 + .type __xray_FunctionExit,@function +__xray_FunctionExit: + .cfi_startproc + .set noreorder + .cpload $t9 + .set reorder + // Save return registers before doing any actual work. + .cfi_def_cfa_offset 36 + addiu $sp, $sp, -36 + sw $ra, 32($sp) + .cfi_offset 31, -4 + sw $a1, 28($sp) + sw $a0, 24($sp) + sw $v1, 20($sp) + sw $v0, 16($sp) + sdc1 $f2, 8($sp) + sdc1 $f0, 0($sp) + + la $t9, _ZN6__xray19XRayPatchedFunctionE + lw $t9, 0($t9) + + beqz $t9, FunctionExit_restore + + // a1=1 means that we are tracing an exit event + li $a1, 1 + // Function ID is in t0 (the first parameter). 
+ move $a0, $t0 + jalr $t9 + +FunctionExit_restore: + // Restore return registers + ldc1 $f0, 0($sp) + ldc1 $f2, 8($sp) + lw $v0, 16($sp) + lw $v1, 20($sp) + lw $a0, 24($sp) + lw $a1, 28($sp) + lw $ra, 32($sp) + addiu $sp, $sp, 36 + jr $ra + +FunctionExit_end: + .size __xray_FunctionExit, FunctionExit_end-__xray_FunctionExit + .cfi_endproc diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_mips64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_mips64.S new file mode 100644 index 0000000..9cbc7e1 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_mips64.S @@ -0,0 +1,136 @@ +//===-- xray_trampoline_mips64.s --------------------------------*- ASM -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the MIPS64-specific assembler for the trampolines. +// +//===----------------------------------------------------------------------===// + + .text + .file "xray_trampoline_mips64.S" + .globl __xray_FunctionEntry + .p2align 2 + .type __xray_FunctionEntry,@function +__xray_FunctionEntry: + .cfi_startproc + // Save argument registers before doing any actual work. 
+ .cfi_def_cfa_offset 144 + daddiu $sp, $sp, -144 + sd $ra, 136($sp) + .cfi_offset 31, -8 + sd $gp, 128($sp) + sd $a7, 120($sp) + sd $a6, 112($sp) + sd $a5, 104($sp) + sd $a4, 96($sp) + sd $a3, 88($sp) + sd $a2, 80($sp) + sd $a1, 72($sp) + sd $a0, 64($sp) + sdc1 $f19, 56($sp) + sdc1 $f18, 48($sp) + sdc1 $f17, 40($sp) + sdc1 $f16, 32($sp) + sdc1 $f15, 24($sp) + sdc1 $f14, 16($sp) + sdc1 $f13, 8($sp) + sdc1 $f12, 0($sp) + + lui $gp, %hi(%neg(%gp_rel(__xray_FunctionEntry))) + daddu $gp, $gp, $t9 + daddiu $gp ,$gp, %lo(%neg(%gp_rel(__xray_FunctionEntry))) + + dla $t9, _ZN6__xray19XRayPatchedFunctionE + ld $t9, 0($t9) + + beqz $t9, FunctionEntry_restore + + // a1=0 means that we are tracing an entry event + move $a1, $zero + // Function ID is in t0 (the first parameter). + move $a0, $t0 + jalr $t9 + +FunctionEntry_restore: + // Restore argument registers + ldc1 $f12, 0($sp) + ldc1 $f13, 8($sp) + ldc1 $f14, 16($sp) + ldc1 $f15, 24($sp) + ldc1 $f16, 32($sp) + ldc1 $f17, 40($sp) + ldc1 $f18, 48($sp) + ldc1 $f19, 56($sp) + ld $a0, 64($sp) + ld $a1, 72($sp) + ld $a2, 80($sp) + ld $a3, 88($sp) + ld $a4, 96($sp) + ld $a5, 104($sp) + ld $a6, 112($sp) + ld $a7, 120($sp) + ld $gp, 128($sp) + ld $ra, 136($sp) + daddiu $sp, $sp, 144 + jr $ra +FunctionEntry_end: + .size __xray_FunctionEntry, FunctionEntry_end-__xray_FunctionEntry + .cfi_endproc + + .text + .globl __xray_FunctionExit + .p2align 2 + .type __xray_FunctionExit,@function +__xray_FunctionExit: + .cfi_startproc + // Save return registers before doing any actual work. 
+ .cfi_def_cfa_offset 64 + daddiu $sp, $sp, -64 + sd $ra, 56($sp) + .cfi_offset 31, -8 + sd $gp, 48($sp) + sd $a0, 40($sp) + sd $v1, 32($sp) + sd $v0, 24($sp) + sdc1 $f2, 16($sp) + sdc1 $f1, 8($sp) + sdc1 $f0, 0($sp) + + lui $gp, %hi(%neg(%gp_rel(__xray_FunctionExit))) + daddu $gp, $gp, $t9 + daddiu $gp ,$gp, %lo(%neg(%gp_rel(__xray_FunctionExit))) + + dla $t9, _ZN6__xray19XRayPatchedFunctionE + ld $t9, 0($t9) + + beqz $t9, FunctionExit_restore + + // a1=1 means that we are tracing an exit event + li $a1, 1 + // Function ID is in t0 (the first parameter). + move $a0, $t0 + jalr $t9 + +FunctionExit_restore: + // Restore return registers + ldc1 $f0, 0($sp) + ldc1 $f1, 8($sp) + ldc1 $f2, 16($sp) + ld $v0, 24($sp) + ld $v1, 32($sp) + ld $a0, 40($sp) + ld $gp, 48($sp) + ld $ra, 56($sp) + daddiu $sp, $sp, 64 + jr $ra + +FunctionExit_end: + .size __xray_FunctionExit, FunctionExit_end-__xray_FunctionExit + .cfi_endproc diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64.cc b/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64.cc new file mode 100644 index 0000000..878c469 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64.cc @@ -0,0 +1,15 @@ +#include <atomic> +#include <xray/xray_interface.h> + +namespace __xray { + +extern std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction; + +// Implement this in C++ instead of assembly, to avoid dealing with ToC by hand. 
+void CallXRayPatchedFunction(int32_t FuncId, XRayEntryType Type) { + auto fptr = __xray::XRayPatchedFunction.load(); + if (fptr != nullptr) + (*fptr)(FuncId, Type); +} + +} // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S b/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S new file mode 100644 index 0000000..250e2e5b --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S @@ -0,0 +1,235 @@ + .text + .abiversion 2 + .globl __xray_FunctionEntry + .p2align 4 +__xray_FunctionEntry: + std 0, 16(1) + stdu 1, -408(1) +# Spill r3-r10, f1-f13, and vsr34-vsr45, which are parameter registers. +# If this appears to be slow, the caller needs to pass in number of generic, +# floating point, and vector parameters, so that we only spill those live ones. + std 3, 32(1) + ld 3, 400(1) # FuncId + std 4, 40(1) + std 5, 48(1) + std 6, 56(1) + std 7, 64(1) + std 8, 72(1) + std 9, 80(1) + std 10, 88(1) + addi 4, 1, 96 + stxsdx 1, 0, 4 + addi 4, 1, 104 + stxsdx 2, 0, 4 + addi 4, 1, 112 + stxsdx 3, 0, 4 + addi 4, 1, 120 + stxsdx 4, 0, 4 + addi 4, 1, 128 + stxsdx 5, 0, 4 + addi 4, 1, 136 + stxsdx 6, 0, 4 + addi 4, 1, 144 + stxsdx 7, 0, 4 + addi 4, 1, 152 + stxsdx 8, 0, 4 + addi 4, 1, 160 + stxsdx 9, 0, 4 + addi 4, 1, 168 + stxsdx 10, 0, 4 + addi 4, 1, 176 + stxsdx 11, 0, 4 + addi 4, 1, 184 + stxsdx 12, 0, 4 + addi 4, 1, 192 + stxsdx 13, 0, 4 + addi 4, 1, 200 + stxvd2x 34, 0, 4 + addi 4, 1, 216 + stxvd2x 35, 0, 4 + addi 4, 1, 232 + stxvd2x 36, 0, 4 + addi 4, 1, 248 + stxvd2x 37, 0, 4 + addi 4, 1, 264 + stxvd2x 38, 0, 4 + addi 4, 1, 280 + stxvd2x 39, 0, 4 + addi 4, 1, 296 + stxvd2x 40, 0, 4 + addi 4, 1, 312 + stxvd2x 41, 0, 4 + addi 4, 1, 328 + stxvd2x 42, 0, 4 + addi 4, 1, 344 + stxvd2x 43, 0, 4 + addi 4, 1, 360 + stxvd2x 44, 0, 4 + addi 4, 1, 376 + stxvd2x 45, 0, 4 + std 2, 392(1) + mflr 0 + std 0, 400(1) + + li 4, 0 + bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType + nop + + addi 4, 1, 96 + lxsdx 1, 
0, 4 + addi 4, 1, 104 + lxsdx 2, 0, 4 + addi 4, 1, 112 + lxsdx 3, 0, 4 + addi 4, 1, 120 + lxsdx 4, 0, 4 + addi 4, 1, 128 + lxsdx 5, 0, 4 + addi 4, 1, 136 + lxsdx 6, 0, 4 + addi 4, 1, 144 + lxsdx 7, 0, 4 + addi 4, 1, 152 + lxsdx 8, 0, 4 + addi 4, 1, 160 + lxsdx 9, 0, 4 + addi 4, 1, 168 + lxsdx 10, 0, 4 + addi 4, 1, 176 + lxsdx 11, 0, 4 + addi 4, 1, 184 + lxsdx 12, 0, 4 + addi 4, 1, 192 + lxsdx 13, 0, 4 + addi 4, 1, 200 + lxvd2x 34, 0, 4 + addi 4, 1, 216 + lxvd2x 35, 0, 4 + addi 4, 1, 232 + lxvd2x 36, 0, 4 + addi 4, 1, 248 + lxvd2x 37, 0, 4 + addi 4, 1, 264 + lxvd2x 38, 0, 4 + addi 4, 1, 280 + lxvd2x 39, 0, 4 + addi 4, 1, 296 + lxvd2x 40, 0, 4 + addi 4, 1, 312 + lxvd2x 41, 0, 4 + addi 4, 1, 328 + lxvd2x 42, 0, 4 + addi 4, 1, 344 + lxvd2x 43, 0, 4 + addi 4, 1, 360 + lxvd2x 44, 0, 4 + addi 4, 1, 376 + lxvd2x 45, 0, 4 + ld 0, 400(1) + mtlr 0 + ld 2, 392(1) + ld 3, 32(1) + ld 4, 40(1) + ld 5, 48(1) + ld 6, 56(1) + ld 7, 64(1) + ld 8, 72(1) + ld 9, 80(1) + ld 10, 88(1) + + addi 1, 1, 408 + ld 0, 16(1) + blr + + .globl __xray_FunctionExit + .p2align 4 +__xray_FunctionExit: + std 0, 16(1) + stdu 1, -256(1) +# Spill r3-r4, f1-f8, and vsr34-vsr41, which are return registers. +# If this appears to be slow, the caller needs to pass in number of generic, +# floating point, and vector parameters, so that we only spill those live ones. 
+ std 3, 32(1) + ld 3, 248(1) # FuncId + std 4, 40(1) + addi 4, 1, 48 + stxsdx 1, 0, 4 + addi 4, 1, 56 + stxsdx 2, 0, 4 + addi 4, 1, 64 + stxsdx 3, 0, 4 + addi 4, 1, 72 + stxsdx 4, 0, 4 + addi 4, 1, 80 + stxsdx 5, 0, 4 + addi 4, 1, 88 + stxsdx 6, 0, 4 + addi 4, 1, 96 + stxsdx 7, 0, 4 + addi 4, 1, 104 + stxsdx 8, 0, 4 + addi 4, 1, 112 + stxvd2x 34, 0, 4 + addi 4, 1, 128 + stxvd2x 35, 0, 4 + addi 4, 1, 144 + stxvd2x 36, 0, 4 + addi 4, 1, 160 + stxvd2x 37, 0, 4 + addi 4, 1, 176 + stxvd2x 38, 0, 4 + addi 4, 1, 192 + stxvd2x 39, 0, 4 + addi 4, 1, 208 + stxvd2x 40, 0, 4 + addi 4, 1, 224 + stxvd2x 41, 0, 4 + std 2, 240(1) + mflr 0 + std 0, 248(1) + + li 4, 1 + bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType + nop + + addi 4, 1, 48 + lxsdx 1, 0, 4 + addi 4, 1, 56 + lxsdx 2, 0, 4 + addi 4, 1, 64 + lxsdx 3, 0, 4 + addi 4, 1, 72 + lxsdx 4, 0, 4 + addi 4, 1, 80 + lxsdx 5, 0, 4 + addi 4, 1, 88 + lxsdx 6, 0, 4 + addi 4, 1, 96 + lxsdx 7, 0, 4 + addi 4, 1, 104 + lxsdx 8, 0, 4 + addi 4, 1, 112 + lxvd2x 34, 0, 4 + addi 4, 1, 128 + lxvd2x 35, 0, 4 + addi 4, 1, 144 + lxvd2x 36, 0, 4 + addi 4, 1, 160 + lxvd2x 37, 0, 4 + addi 4, 1, 176 + lxvd2x 38, 0, 4 + addi 4, 1, 192 + lxvd2x 39, 0, 4 + addi 4, 1, 208 + lxvd2x 40, 0, 4 + addi 4, 1, 224 + lxvd2x 41, 0, 4 + ld 0, 248(1) + mtlr 0 + ld 2, 240(1) + ld 3, 32(1) + ld 4, 40(1) + + addi 1, 1, 256 + ld 0, 16(1) + blr diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S index d90c30c..b59eedc 100644 --- a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -13,54 +13,64 @@ // //===----------------------------------------------------------------------===// +#include "../builtins/assembly.h" + .macro SAVE_REGISTERS - subq $200, %rsp - movupd %xmm0, 184(%rsp) - movupd %xmm1, 168(%rsp) - movupd %xmm2, 152(%rsp) - movupd %xmm3, 136(%rsp) - movupd %xmm4, 120(%rsp) - movupd %xmm5, 104(%rsp) - movupd %xmm6, 
88(%rsp) - movupd %xmm7, 72(%rsp) - movq %rdi, 64(%rsp) - movq %rax, 56(%rsp) - movq %rdx, 48(%rsp) - movq %rsi, 40(%rsp) - movq %rcx, 32(%rsp) - movq %r8, 24(%rsp) - movq %r9, 16(%rsp) + subq $192, %rsp + .cfi_def_cfa_offset 200 + // At this point, the stack pointer should be aligned to an 8-byte boundary, + // because any call instructions that come after this will add another 8 + // bytes and therefore align it to 16-bytes. + movq %rbp, 184(%rsp) + movupd %xmm0, 168(%rsp) + movupd %xmm1, 152(%rsp) + movupd %xmm2, 136(%rsp) + movupd %xmm3, 120(%rsp) + movupd %xmm4, 104(%rsp) + movupd %xmm5, 88(%rsp) + movupd %xmm6, 72(%rsp) + movupd %xmm7, 56(%rsp) + movq %rdi, 48(%rsp) + movq %rax, 40(%rsp) + movq %rdx, 32(%rsp) + movq %rsi, 24(%rsp) + movq %rcx, 16(%rsp) + movq %r8, 8(%rsp) + movq %r9, 0(%rsp) .endm .macro RESTORE_REGISTERS - movupd 184(%rsp), %xmm0 - movupd 168(%rsp), %xmm1 - movupd 152(%rsp), %xmm2 - movupd 136(%rsp), %xmm3 - movupd 120(%rsp), %xmm4 - movupd 104(%rsp), %xmm5 - movupd 88(%rsp) , %xmm6 - movupd 72(%rsp) , %xmm7 - movq 64(%rsp), %rdi - movq 56(%rsp), %rax - movq 48(%rsp), %rdx - movq 40(%rsp), %rsi - movq 32(%rsp), %rcx - movq 24(%rsp), %r8 - movq 16(%rsp), %r9 - addq $200, %rsp + movq 184(%rsp), %rbp + movupd 168(%rsp), %xmm0 + movupd 152(%rsp), %xmm1 + movupd 136(%rsp), %xmm2 + movupd 120(%rsp), %xmm3 + movupd 104(%rsp), %xmm4 + movupd 88(%rsp), %xmm5 + movupd 72(%rsp) , %xmm6 + movupd 56(%rsp) , %xmm7 + movq 48(%rsp), %rdi + movq 40(%rsp), %rax + movq 32(%rsp), %rdx + movq 24(%rsp), %rsi + movq 16(%rsp), %rcx + movq 8(%rsp), %r8 + movq 0(%rsp), %r9 + addq $192, %rsp + .cfi_def_cfa_offset 8 .endm .text .file "xray_trampoline_x86.S" + +//===----------------------------------------------------------------------===// + .globl __xray_FunctionEntry .align 16, 0x90 .type __xray_FunctionEntry,@function __xray_FunctionEntry: .cfi_startproc - pushq %rbp - .cfi_def_cfa_offset 16 SAVE_REGISTERS // This load has to be atomic, it's concurrent with 
__xray_patch(). @@ -75,12 +85,13 @@ __xray_FunctionEntry: callq *%rax .Ltmp0: RESTORE_REGISTERS - popq %rbp retq .Ltmp1: .size __xray_FunctionEntry, .Ltmp1-__xray_FunctionEntry .cfi_endproc +//===----------------------------------------------------------------------===// + .globl __xray_FunctionExit .align 16, 0x90 .type __xray_FunctionExit,@function @@ -89,14 +100,13 @@ __xray_FunctionExit: // Save the important registers first. Since we're assuming that this // function is only jumped into, we only preserve the registers for // returning. - pushq %rbp - .cfi_def_cfa_offset 16 subq $56, %rsp - .cfi_def_cfa_offset 32 - movupd %xmm0, 40(%rsp) - movupd %xmm1, 24(%rsp) - movq %rax, 16(%rsp) - movq %rdx, 8(%rsp) + .cfi_def_cfa_offset 64 + movq %rbp, 48(%rsp) + movupd %xmm0, 32(%rsp) + movupd %xmm1, 16(%rsp) + movq %rax, 8(%rsp) + movq %rdx, 0(%rsp) movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax testq %rax,%rax je .Ltmp2 @@ -106,17 +116,20 @@ __xray_FunctionExit: callq *%rax .Ltmp2: // Restore the important registers. - movupd 40(%rsp), %xmm0 - movupd 24(%rsp), %xmm1 - movq 16(%rsp), %rax - movq 8(%rsp), %rdx + movq 48(%rsp), %rbp + movupd 32(%rsp), %xmm0 + movupd 16(%rsp), %xmm1 + movq 8(%rsp), %rax + movq 0(%rsp), %rdx addq $56, %rsp - popq %rbp + .cfi_def_cfa_offset 8 retq .Ltmp3: .size __xray_FunctionExit, .Ltmp3-__xray_FunctionExit .cfi_endproc +//===----------------------------------------------------------------------===// + .global __xray_FunctionTailExit .align 16, 0x90 .type __xray_FunctionTailExit,@function @@ -126,8 +139,6 @@ __xray_FunctionTailExit: // this is an exit. In the future, we will introduce a new entry type that // differentiates between a normal exit and a tail exit, but we'd have to do // this and increment the version number for the header. 
- pushq %rbp - .cfi_def_cfa_offset 16 SAVE_REGISTERS movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax @@ -140,8 +151,82 @@ __xray_FunctionTailExit: .Ltmp4: RESTORE_REGISTERS - popq %rbp retq .Ltmp5: .size __xray_FunctionTailExit, .Ltmp5-__xray_FunctionTailExit .cfi_endproc + +//===----------------------------------------------------------------------===// + + .globl __xray_ArgLoggerEntry + .align 16, 0x90 + .type __xray_ArgLoggerEntry,@function +__xray_ArgLoggerEntry: + .cfi_startproc + SAVE_REGISTERS + + // Again, these function pointer loads must be atomic; MOV is fine. + movq _ZN6__xray13XRayArgLoggerE(%rip), %rax + testq %rax, %rax + jne .Larg1entryLog + + // If [arg1 logging handler] not set, defer to no-arg logging. + movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax + testq %rax, %rax + je .Larg1entryFail + +.Larg1entryLog: + + // First argument will become the third + movq %rdi, %rdx + + // XRayEntryType::ENTRY into the second + xorq %rsi, %rsi + + // 32-bit function ID becomes the first + movl %r10d, %edi + callq *%rax + +.Larg1entryFail: + RESTORE_REGISTERS + retq + +.Larg1entryEnd: + .size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry + .cfi_endproc + +//===----------------------------------------------------------------------===// + + .global __xray_CustomEvent + .align 16, 0x90 + .type __xray_CustomEvent,@function +__xray_CustomEvent: + .cfi_startproc + subq $16, %rsp + .cfi_def_cfa_offset 24 + movq %rbp, 8(%rsp) + movq %rax, 0(%rsp) + + // We take two arguments to this trampoline, which should be in rdi and rsi + // already. We also make sure that we stash %rax because we use that register + // to call the logging handler. + movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax + testq %rax,%rax + je .LcustomEventCleanup + + // At this point we know that rdi and rsi already have the data, so we just + // call the logging handler.
+ callq *%rax + +.LcustomEventCleanup: + movq 0(%rsp), %rax + movq 8(%rsp), %rbp + addq $16, %rsp + .cfi_def_cfa_offset 8 + retq + +.Ltmp8: + .size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent + .cfi_endproc + +NO_EXEC_STACK_DIRECTIVE diff --git a/contrib/compiler-rt/lib/xray/xray_tsc.h b/contrib/compiler-rt/lib/xray/xray_tsc.h new file mode 100644 index 0000000..4507564 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_tsc.h @@ -0,0 +1,68 @@ +//===-- xray_tsc.h ----------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_EMULATE_TSC_H +#define XRAY_EMULATE_TSC_H + +namespace __xray { +static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000; +} + +#if defined(__x86_64__) +#include "xray_x86_64.inc" +#elif defined(__powerpc64__) +#include "xray_powerpc64.inc" +#elif defined(__arm__) || defined(__aarch64__) || defined(__mips__) +// Emulated TSC. +// There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does +// not have a constant frequency like TSC on x86(_64), it may go faster +// or slower depending on CPU turbo or power saving mode. Furthermore, +// to read from CP15 on ARM a kernel modification or a driver is needed. +// We can not require this from users of compiler-rt. +// So on ARM we use clock_gettime() which gives the result in nanoseconds. +// To get the measurements per second, we scale this by the number of +// nanoseconds per second, pretending that the TSC frequency is 1GHz and +// one TSC tick is 1 nanosecond. 
+#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" +#include "xray_defs.h" +#include <cerrno> +#include <cstdint> +#include <time.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + timespec TS; + int result = clock_gettime(CLOCK_REALTIME, &TS); + if (result != 0) { + Report("clock_gettime(2) returned %d, errno=%d.", result, int(errno)); + TS.tv_sec = 0; + TS.tv_nsec = 0; + } + CPU = 0; + return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + return NanosecondsPerSecond; +} + +} // namespace __xray + +#else +#error Target architecture is not supported. +#endif // CPU architecture + +#endif // XRAY_EMULATE_TSC_H diff --git a/contrib/compiler-rt/lib/xray/xray_utils.cc b/contrib/compiler-rt/lib/xray/xray_utils.cc new file mode 100644 index 0000000..b9a38d1 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_utils.cc @@ -0,0 +1,125 @@ +//===-- xray_utils.cc -------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. 
+// +//===----------------------------------------------------------------------===// +#include "xray_utils.h" + +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include <stdlib.h> +#include <cstdio> +#include <errno.h> +#include <fcntl.h> +#include <iterator> +#include <sys/types.h> +#include <tuple> +#include <unistd.h> +#include <utility> + +namespace __xray { + +void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT { + fprintf(stderr, "%s", Buffer); +} + +void retryingWriteAll(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT { + if (Begin == End) + return; + auto TotalBytes = std::distance(Begin, End); + while (auto Written = write(Fd, Begin, TotalBytes)) { + if (Written < 0) { + if (errno == EINTR) + continue; // Try again. + Report("Failed to write; errno = %d\n", errno); + return; + } + TotalBytes -= Written; + if (TotalBytes == 0) + break; + Begin += Written; + } +} + +std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, + char *End) XRAY_NEVER_INSTRUMENT { + auto BytesToRead = std::distance(Begin, End); + ssize_t BytesRead; + ssize_t TotalBytesRead = 0; + while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) { + if (BytesRead == -1) { + if (errno == EINTR) + continue; + Report("Read error; errno = %d\n", errno); + return std::make_pair(TotalBytesRead, false); + } + + TotalBytesRead += BytesRead; + BytesToRead -= BytesRead; + Begin += BytesRead; + } + return std::make_pair(TotalBytesRead, true); +} + +bool readValueFromFile(const char *Filename, + long long *Value) XRAY_NEVER_INSTRUMENT { + int Fd = open(Filename, O_RDONLY | O_CLOEXEC); + if (Fd == -1) + return false; + static constexpr size_t BufSize = 256; + char Line[BufSize] = {}; + ssize_t BytesRead; + bool Success; + std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize); + if (!Success) + return false; + close(Fd); + char *End = nullptr; + long long Tmp = internal_simple_strtoll(Line, &End, 10); + 
bool Result = false; + if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { + *Value = Tmp; + Result = true; + } + return Result; +} + +int getLogFD() XRAY_NEVER_INSTRUMENT { + // Open a temporary file once for the log. + static char TmpFilename[256] = {}; + static char TmpWildcardPattern[] = "XXXXXX"; + auto Argv = GetArgv(); + const char *Progname = Argv[0] == nullptr ? "(unknown)" : Argv[0]; + const char *LastSlash = internal_strrchr(Progname, '/'); + + if (LastSlash != nullptr) + Progname = LastSlash + 1; + + const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern); + int NeededLength = internal_snprintf( + TmpFilename, sizeof(TmpFilename), "%.*s%.*s.%s", HalfLength, + flags()->xray_logfile_base, HalfLength, Progname, TmpWildcardPattern); + if (NeededLength > int(sizeof(TmpFilename))) { + Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename); + return -1; + } + int Fd = mkstemp(TmpFilename); + if (Fd == -1) { + Report("XRay: Failed opening temporary file '%s'; not logging events.\n", + TmpFilename); + return -1; + } + Report("XRay: Log file in '%s'\n", TmpFilename); + + return Fd; +} + +} // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_utils.h b/contrib/compiler-rt/lib/xray/xray_utils.h new file mode 100644 index 0000000..1ecc74a --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_utils.h @@ -0,0 +1,41 @@ +//===-- xray_utils.h --------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Some shared utilities for the XRay runtime implementation. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_UTILS_H +#define XRAY_UTILS_H + +#include <sys/types.h> +#include <utility> + +namespace __xray { + +// Default implementation of the reporting interface for sanitizer errors. +void printToStdErr(const char *Buffer); + +// EINTR-safe write routine, provided a file descriptor and a character range. +void retryingWriteAll(int Fd, char *Begin, char *End); + +// Reads a long long value from a provided file. +bool readValueFromFile(const char *Filename, long long *Value); + +// EINTR-safe read routine, providing a file descriptor and a character range. +std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End); + +// EINTR-safe open routine, uses flag-provided values for initialising a log +// file. +int getLogFD(); + +} // namespace __xray + +#endif // XRAY_UTILS_H diff --git a/contrib/compiler-rt/lib/xray/xray_x86_64.cc b/contrib/compiler-rt/lib/xray/xray_x86_64.cc index 3ee9189..e34806f 100644 --- a/contrib/compiler-rt/lib/xray/xray_x86_64.cc +++ b/contrib/compiler-rt/lib/xray/xray_x86_64.cc @@ -1,6 +1,8 @@ +#include "cpuid.h" #include "sanitizer_common/sanitizer_common.h" #include "xray_defs.h" #include "xray_interface_internal.h" + #include <atomic> #include <cstdint> #include <errno.h> @@ -42,9 +44,9 @@ static bool readValueFromFile(const char *Filename, ssize_t BytesRead; bool Success; std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize); + close(Fd); if (!Success) return false; - close(Fd); char *End = nullptr; long long Tmp = internal_simple_strtoll(Line, &End, 10); bool Result = false; @@ -55,32 +57,35 @@ static bool readValueFromFile(const char *Filename, return Result; } -uint64_t cycleFrequency() XRAY_NEVER_INSTRUMENT { - long long CPUFrequency = -1; +uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + long long TSCFrequency = -1; if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", - &CPUFrequency)) { - 
CPUFrequency *= 1000; + &TSCFrequency)) { + TSCFrequency *= 1000; } else if (readValueFromFile( - "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", - &CPUFrequency)) { - CPUFrequency *= 1000; + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &TSCFrequency)) { + TSCFrequency *= 1000; } else { Report("Unable to determine CPU frequency for TSC accounting.\n"); } - return CPUFrequency == -1 ? 0 : static_cast<uint64_t>(CPUFrequency); + return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency); } static constexpr uint8_t CallOpCode = 0xe8; static constexpr uint16_t MovR10Seq = 0xba41; static constexpr uint16_t Jmp9Seq = 0x09eb; +static constexpr uint16_t Jmp20Seq = 0x14eb; static constexpr uint8_t JmpOpCode = 0xe9; static constexpr uint8_t RetOpCode = 0xc3; +static constexpr uint16_t NopwSeq = 0x9066; static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()}; static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()}; bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -101,13 +106,12 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, // 4. Do an atomic write over the jmp instruction for the "mov r10d" // opcode and first operand. // - // Prerequisite is to compute the relative offset to the - // __xray_FunctionEntry function's address. - int64_t TrampolineOffset = reinterpret_cast<int64_t>(__xray_FunctionEntry) - + // Prerequisite is to compute the relative offset to the trampoline's address. 
+ int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) - (static_cast<int64_t>(Sled.Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Entry trampoline (%p) too far from sled (%p)\n", - __xray_FunctionEntry, reinterpret_cast<void *>(Sled.Address)); + Trampoline, reinterpret_cast<void *>(Sled.Address)); return false; } if (Enable) { @@ -199,4 +203,60 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, return true; } +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + // Here we do the dance of replacing the following sled: + // + // xray_sled_n: + // jmp +20 // 2 bytes + // ... + // + // With the following: + // + // nopw // 2 bytes* + // ... + // + // We need to do this in the following order: + // + // 1. Overwrite the 5-byte nop with the call (relative), where (relative) is + // the relative offset to the __xray_CustomEvent trampoline. + // 2. Do a two-byte atomic write over the 'jmp +20' to turn it into a 'nopw'. + // This allows us to "enable" this code once the changes have committed. + // + // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'. + // + if (Enable) { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq, + std::memory_order_release); + } else { + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq, + std::memory_order_release); + } + return false; +} + +// We determine whether the CPU we're running on has the correct features we +// need. In x86_64 this will be rdtscp support. +bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { + unsigned int EAX, EBX, ECX, EDX; + + // We check whether rdtscp support is enabled. According to the x86_64 manual, + // level should be set at 0x80000001, and we should have a look at bit 27 in + // EDX. That's 0x8000000 (or 1u << 27). + // NOTE(review): the mask tested below is 1u << 26 (bit 26), not bit 27 -- + // confirm against the CPUID documentation. 
+ __get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX); + if (!(EDX & (1u << 26))) { + Report("Missing rdtscp support.\n"); + return false; + } + // Also check whether we can determine the CPU frequency, since if we cannot, + // we should use the emulated TSC instead. + if (!getTSCFrequency()) { + Report("Unable to determine CPU frequency.\n"); + return false; + } + return true; +} + } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_x86_64.h b/contrib/compiler-rt/lib/xray/xray_x86_64.inc index 52d2dea..4ad3f98 100644 --- a/contrib/compiler-rt/lib/xray/xray_x86_64.h +++ b/contrib/compiler-rt/lib/xray/xray_x86_64.inc @@ -1,4 +1,4 @@ -//===-- xray_x86_64.h -------------------------------------------*- C++ -*-===// +//===-- xray_x86_64.inc -----------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,8 +10,6 @@ // This file is a part of XRay, a dynamic runtime instrumentation system. // //===----------------------------------------------------------------------===// -#ifndef XRAY_X86_64_H -#define XRAY_X86_64_H #include <cstdint> #include <x86intrin.h> @@ -27,6 +25,9 @@ ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { CPU = LongCPU; return TSC; } -} -#endif // XRAY_X86_64_H +uint64_t getTSCFrequency(); + +bool probeRequiredCPUFeatures(); + +} // namespace __xray |