Diffstat (limited to 'contrib/llvm/lib/Transforms/Instrumentation')
11 files changed, 10073 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp new file mode 100644 index 0000000..e7ef9f9 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -0,0 +1,2036 @@ +//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of AddressSanitizer, an address sanity checker. +// Details of the algorithm: +// http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/SwapByteOrder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/ASanStackFrameLayout.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include <algorithm> +#include <string> +#include <system_error> + +using namespace llvm; + +#define DEBUG_TYPE "asan" + +static const uint64_t kDefaultShadowScale = 3; +static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; +static const uint64_t kIOSShadowOffset32 = 1ULL << 30; +static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; +static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G. 
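These constants drive ASan's shadow mapping, Shadow = (Mem >> Scale) + Offset, described at the algorithm link above: one shadow byte describes 2^Scale application bytes. A minimal standalone sketch of the arithmetic for the common x86_64 userspace case (scale 3, offset kSmallX86_64ShadowOffset); the pass itself emits this computation as IR in memToShadow further down, this is only the arithmetic:

#include <cstdint>
#include <cstdio>

// Values below are the ones defined in this file for x86_64 userspace;
// other platforms substitute their offset from the table of constants.
static const uint64_t kScale  = 3;           // kDefaultShadowScale
static const uint64_t kOffset = 0x7FFF8000;  // kSmallX86_64ShadowOffset

uint64_t memToShadow(uint64_t Addr) { return (Addr >> kScale) + kOffset; }

int main() {
  uint64_t Addr = 0x602000000000ULL;  // an arbitrary heap-like address
  printf("%#llx -> shadow %#llx\n", (unsigned long long)Addr,
         (unsigned long long)memToShadow(Addr));
  return 0;
}
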
+static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000; +static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; +static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000; +static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37; +static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36; +static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30; +static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46; +static const uint64_t kWindowsShadowOffset32 = 3ULL << 28; + +static const size_t kMinStackMallocSize = 1 << 6; // 64B +static const size_t kMaxStackMallocSize = 1 << 16; // 64K +static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; +static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; + +static const char *const kAsanModuleCtorName = "asan.module_ctor"; +static const char *const kAsanModuleDtorName = "asan.module_dtor"; +static const uint64_t kAsanCtorAndDtorPriority = 1; +static const char *const kAsanReportErrorTemplate = "__asan_report_"; +static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; +static const char *const kAsanUnregisterGlobalsName = + "__asan_unregister_globals"; +static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; +static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; +static const char *const kAsanInitName = "__asan_init_v5"; +static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp"; +static const char *const kAsanPtrSub = "__sanitizer_ptr_sub"; +static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return"; +static const int kMaxAsanStackMallocSizeClass = 10; +static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_"; +static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_"; +static const char *const kAsanGenPrefix = "__asan_gen_"; +static const char *const kSanCovGenPrefix = "__sancov_gen_"; +static const char *const kAsanPoisonStackMemoryName = + "__asan_poison_stack_memory"; +static const char *const kAsanUnpoisonStackMemoryName = + "__asan_unpoison_stack_memory"; + +static const char *const kAsanOptionDetectUAR = + "__asan_option_detect_stack_use_after_return"; + +static const char *const kAsanAllocaPoison = "__asan_alloca_poison"; +static const char *const kAsanAllocasUnpoison = "__asan_allocas_unpoison"; + +// Accesses sizes are powers of two: 1, 2, 4, 8, 16. +static const size_t kNumberOfAccessSizes = 5; + +static const unsigned kAllocaRzSize = 32; + +// Command-line flags. +static cl::opt<bool> ClEnableKasan( + "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"), + cl::Hidden, cl::init(false)); + +// This flag may need to be replaced with -f[no-]asan-reads. +static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", + cl::desc("instrument read instructions"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentWrites( + "asan-instrument-writes", cl::desc("instrument write instructions"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentAtomics( + "asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, + cl::init(true)); +static cl::opt<bool> ClAlwaysSlowPath( + "asan-always-slow-path", + cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden, + cl::init(false)); +// This flag limits the number of instructions to be instrumented +// in any given BB. 
Normally, this should be set to unlimited (INT_MAX), +// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporary +// set it to 10000. +static cl::opt<int> ClMaxInsnsToInstrumentPerBB( + "asan-max-ins-per-bb", cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); +// This flag may need to be replaced with -f[no]asan-stack. +static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", + cl::desc("Check return-after-free"), + cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -f[no]asan-globals. +static cl::opt<bool> ClGlobals("asan-globals", + cl::desc("Handle global objects"), cl::Hidden, + cl::init(true)); +static cl::opt<bool> ClInitializers("asan-initialization-order", + cl::desc("Handle C++ initializer order"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInvalidPointerPairs( + "asan-detect-invalid-pointer-pair", + cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden, + cl::init(false)); +static cl::opt<unsigned> ClRealignStack( + "asan-realign-stack", + cl::desc("Realign stack to the value of this flag (power of two)"), + cl::Hidden, cl::init(32)); +static cl::opt<int> ClInstrumentationWithCallsThreshold( + "asan-instrumentation-with-call-threshold", + cl::desc( + "If the function being instrumented contains more than " + "this number of memory accesses, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(7000)); +static cl::opt<std::string> ClMemoryAccessCallbackPrefix( + "asan-memory-access-callback-prefix", + cl::desc("Prefix for memory access callbacks"), cl::Hidden, + cl::init("__asan_")); +static cl::opt<bool> ClInstrumentAllocas("asan-instrument-allocas", + cl::desc("instrument dynamic allocas"), + cl::Hidden, cl::init(false)); +static cl::opt<bool> ClSkipPromotableAllocas( + "asan-skip-promotable-allocas", + cl::desc("Do not instrument promotable allocas"), cl::Hidden, + cl::init(true)); + +// These flags allow to change the shadow mapping. +// The shadow mapping looks like +// Shadow = (Mem >> scale) + (1 << offset_log) +static cl::opt<int> ClMappingScale("asan-mapping-scale", + cl::desc("scale of asan shadow mapping"), + cl::Hidden, cl::init(0)); + +// Optimization flags. Not user visible, used mostly for testing +// and benchmarking the tool. +static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptSameTemp( + "asan-opt-same-temp", cl::desc("Instrument the same temp just once"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptGlobals("asan-opt-globals", + cl::desc("Don't instrument scalar globals"), + cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptStack( + "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"), + cl::Hidden, cl::init(false)); + +static cl::opt<bool> ClCheckLifetime( + "asan-check-lifetime", + cl::desc("Use llvm.lifetime intrinsics to insert extra checks"), cl::Hidden, + cl::init(false)); + +static cl::opt<bool> ClDynamicAllocaStack( + "asan-stack-dynamic-alloca", + cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden, + cl::init(true)); + +static cl::opt<uint32_t> ClForceExperiment( + "asan-force-experiment", + cl::desc("Force optimization experiment (for testing)"), cl::Hidden, + cl::init(0)); + +// Debug flags. 
+static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, + cl::init(0)); +static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"), + cl::Hidden, cl::init(0)); +static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden, + cl::desc("Debug func")); +static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), + cl::Hidden, cl::init(-1)); +static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), + cl::Hidden, cl::init(-1)); + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOptimizedAccessesToGlobalVar, + "Number of optimized accesses to global vars"); +STATISTIC(NumOptimizedAccessesToStackVar, + "Number of optimized accesses to stack vars"); + +namespace { +/// Frontend-provided metadata for source location. +struct LocationMetadata { + StringRef Filename; + int LineNo; + int ColumnNo; + + LocationMetadata() : Filename(), LineNo(0), ColumnNo(0) {} + + bool empty() const { return Filename.empty(); } + + void parse(MDNode *MDN) { + assert(MDN->getNumOperands() == 3); + MDString *DIFilename = cast<MDString>(MDN->getOperand(0)); + Filename = DIFilename->getString(); + LineNo = + mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue(); + ColumnNo = + mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue(); + } +}; + +/// Frontend-provided metadata for global variables. +class GlobalsMetadata { + public: + struct Entry { + Entry() : SourceLoc(), Name(), IsDynInit(false), IsBlacklisted(false) {} + LocationMetadata SourceLoc; + StringRef Name; + bool IsDynInit; + bool IsBlacklisted; + }; + + GlobalsMetadata() : inited_(false) {} + + void init(Module &M) { + assert(!inited_); + inited_ = true; + NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals"); + if (!Globals) return; + for (auto MDN : Globals->operands()) { + // Metadata node contains the global and the fields of "Entry". + assert(MDN->getNumOperands() == 5); + auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0)); + // The optimizer may optimize away a global entirely. + if (!GV) continue; + // We can already have an entry for GV if it was merged with another + // global. + Entry &E = Entries[GV]; + if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1))) + E.SourceLoc.parse(Loc); + if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2))) + E.Name = Name->getString(); + ConstantInt *IsDynInit = + mdconst::extract<ConstantInt>(MDN->getOperand(3)); + E.IsDynInit |= IsDynInit->isOne(); + ConstantInt *IsBlacklisted = + mdconst::extract<ConstantInt>(MDN->getOperand(4)); + E.IsBlacklisted |= IsBlacklisted->isOne(); + } + } + + /// Returns metadata entry for a given global. + Entry get(GlobalVariable *G) const { + auto Pos = Entries.find(G); + return (Pos != Entries.end()) ? Pos->second : Entry(); + } + + private: + bool inited_; + DenseMap<GlobalVariable *, Entry> Entries; +}; + +/// This struct defines the shadow mapping using the rule: +/// shadow = (mem >> Scale) ADD-or-OR Offset. 
+struct ShadowMapping { + int Scale; + uint64_t Offset; + bool OrShadowOffset; +}; + +static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize, + bool IsKasan) { + bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsIOS = TargetTriple.isiOS(); + bool IsFreeBSD = TargetTriple.isOSFreeBSD(); + bool IsLinux = TargetTriple.isOSLinux(); + bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 || + TargetTriple.getArch() == llvm::Triple::ppc64le; + bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips || + TargetTriple.getArch() == llvm::Triple::mipsel; + bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || + TargetTriple.getArch() == llvm::Triple::mips64el; + bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64; + bool IsWindows = TargetTriple.isOSWindows(); + + ShadowMapping Mapping; + + if (LongSize == 32) { + if (IsAndroid) + Mapping.Offset = 0; + else if (IsMIPS32) + Mapping.Offset = kMIPS32_ShadowOffset32; + else if (IsFreeBSD) + Mapping.Offset = kFreeBSD_ShadowOffset32; + else if (IsIOS) + Mapping.Offset = kIOSShadowOffset32; + else if (IsWindows) + Mapping.Offset = kWindowsShadowOffset32; + else + Mapping.Offset = kDefaultShadowOffset32; + } else { // LongSize == 64 + if (IsPPC64) + Mapping.Offset = kPPC64_ShadowOffset64; + else if (IsFreeBSD) + Mapping.Offset = kFreeBSD_ShadowOffset64; + else if (IsLinux && IsX86_64) { + if (IsKasan) + Mapping.Offset = kLinuxKasan_ShadowOffset64; + else + Mapping.Offset = kSmallX86_64ShadowOffset; + } else if (IsMIPS64) + Mapping.Offset = kMIPS64_ShadowOffset64; + else if (IsAArch64) + Mapping.Offset = kAArch64_ShadowOffset64; + else + Mapping.Offset = kDefaultShadowOffset64; + } + + Mapping.Scale = kDefaultShadowScale; + if (ClMappingScale) { + Mapping.Scale = ClMappingScale; + } + + // OR-ing shadow offset if more efficient (at least on x86) if the offset + // is a power of two, but on ppc64 we have to use add since the shadow + // offset is not necessary 1/8-th of the address space. + Mapping.OrShadowOffset = !IsPPC64 && !(Mapping.Offset & (Mapping.Offset - 1)); + + return Mapping; +} + +static size_t RedzoneSizeForScale(int MappingScale) { + // Redzone used for stack and globals is at least 32 bytes. + // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. + return std::max(32U, 1U << MappingScale); +} + +/// AddressSanitizer: instrument the code in module to find memory bugs. +struct AddressSanitizer : public FunctionPass { + explicit AddressSanitizer(bool CompileKernel = false) + : FunctionPass(ID), CompileKernel(CompileKernel || ClEnableKasan) { + initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "AddressSanitizerFunctionPass"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + uint64_t getAllocaSizeInBytes(AllocaInst *AI) const { + Type *Ty = AI->getAllocatedType(); + uint64_t SizeInBytes = + AI->getModule()->getDataLayout().getTypeAllocSize(Ty); + return SizeInBytes; + } + /// Check if we want (and can) handle this alloca. + bool isInterestingAlloca(AllocaInst &AI); + + // Check if we have dynamic alloca. 
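getShadowMapping above reduces to a per-platform offset table plus two small computations: the ADD-or-OR decision (OR is only sound when the offset is a power of two, and ppc64 is excluded because its offset is not necessarily 1/8th of the address space) and the scale-dependent minimum redzone. Both in isolation, as a standalone sketch:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Mirrors the OrShadowOffset line above: OR-ing the offset into the shifted
// address is only equivalent to adding it when the offset is a power of two.
bool canOrShadowOffset(uint64_t Offset, bool IsPPC64) {
  return !IsPPC64 && !(Offset & (Offset - 1));
}

// Mirrors RedzoneSizeForScale above: at least 32 bytes, and scales 6 and 7
// force 64 and 128 bytes respectively.
size_t redzoneSizeForScale(int MappingScale) {
  return std::max<size_t>(32, size_t(1) << MappingScale);
}
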
+ bool isDynamicAlloca(AllocaInst &AI) const { + return AI.isArrayAllocation() || !AI.isStaticAlloca(); + } + + /// If it is an interesting memory access, return the PointerOperand + /// and set IsWrite/Alignment. Otherwise return nullptr. + Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite, + uint64_t *TypeSize, unsigned *Alignment); + void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I, + bool UseCalls, const DataLayout &DL); + void instrumentPointerComparisonOrSubtraction(Instruction *I); + void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, uint32_t Exp); + void instrumentUnusualSizeOrAlignment(Instruction *I, Value *Addr, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp); + Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, uint32_t TypeSize); + Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, + bool IsWrite, size_t AccessSizeIndex, + Value *SizeArgument, uint32_t Exp); + void instrumentMemIntrinsic(MemIntrinsic *MI); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + bool runOnFunction(Function &F) override; + bool maybeInsertAsanInitAtFunctionEntry(Function &F); + bool doInitialization(Module &M) override; + static char ID; // Pass identification, replacement for typeid + + DominatorTree &getDominatorTree() const { return *DT; } + + private: + void initializeCallbacks(Module &M); + + bool LooksLikeCodeInBug11395(Instruction *I); + bool GlobalIsLinkerInitialized(GlobalVariable *G); + bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr, + uint64_t TypeSize) const; + + LLVMContext *C; + Triple TargetTriple; + int LongSize; + bool CompileKernel; + Type *IntptrTy; + ShadowMapping Mapping; + DominatorTree *DT; + Function *AsanCtorFunction = nullptr; + Function *AsanInitFunction = nullptr; + Function *AsanHandleNoReturnFunc; + Function *AsanPtrCmpFunction, *AsanPtrSubFunction; + // This array is indexed by AccessIsWrite, Experiment and log2(AccessSize). + Function *AsanErrorCallback[2][2][kNumberOfAccessSizes]; + Function *AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes]; + // This array is indexed by AccessIsWrite and Experiment. 
+ Function *AsanErrorCallbackSized[2][2]; + Function *AsanMemoryAccessCallbackSized[2][2]; + Function *AsanMemmove, *AsanMemcpy, *AsanMemset; + InlineAsm *EmptyAsm; + GlobalsMetadata GlobalsMD; + DenseMap<AllocaInst *, bool> ProcessedAllocas; + + friend struct FunctionStackPoisoner; +}; + +class AddressSanitizerModule : public ModulePass { + public: + explicit AddressSanitizerModule(bool CompileKernel = false) + : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan) {} + bool runOnModule(Module &M) override; + static char ID; // Pass identification, replacement for typeid + const char *getPassName() const override { return "AddressSanitizerModule"; } + + private: + void initializeCallbacks(Module &M); + + bool InstrumentGlobals(IRBuilder<> &IRB, Module &M); + bool ShouldInstrumentGlobal(GlobalVariable *G); + void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName); + void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName); + size_t MinRedzoneSizeForGlobal() const { + return RedzoneSizeForScale(Mapping.Scale); + } + + GlobalsMetadata GlobalsMD; + bool CompileKernel; + Type *IntptrTy; + LLVMContext *C; + Triple TargetTriple; + ShadowMapping Mapping; + Function *AsanPoisonGlobals; + Function *AsanUnpoisonGlobals; + Function *AsanRegisterGlobals; + Function *AsanUnregisterGlobals; +}; + +// Stack poisoning does not play well with exception handling. +// When an exception is thrown, we essentially bypass the code +// that unpoisones the stack. This is why the run-time library has +// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire +// stack in the interceptor. This however does not work inside the +// actual function which catches the exception. Most likely because the +// compiler hoists the load of the shadow value somewhere too high. +// This causes asan to report a non-existing bug on 453.povray. +// It sounds like an LLVM bug. +struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { + Function &F; + AddressSanitizer &ASan; + DIBuilder DIB; + LLVMContext *C; + Type *IntptrTy; + Type *IntptrPtrTy; + ShadowMapping Mapping; + + SmallVector<AllocaInst *, 16> AllocaVec; + SmallVector<Instruction *, 8> RetVec; + unsigned StackAlignment; + + Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1], + *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1]; + Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc; + Function *AsanAllocaPoisonFunc, *AsanAllocasUnpoisonFunc; + + // Stores a place and arguments of poisoning/unpoisoning call for alloca. + struct AllocaPoisonCall { + IntrinsicInst *InsBefore; + AllocaInst *AI; + uint64_t Size; + bool DoPoison; + }; + SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec; + + SmallVector<AllocaInst *, 1> DynamicAllocaVec; + SmallVector<IntrinsicInst *, 1> StackRestoreVec; + AllocaInst *DynamicAllocaLayout = nullptr; + + // Maps Value to an AllocaInst from which the Value is originated. 
+ typedef DenseMap<Value *, AllocaInst *> AllocaForValueMapTy; + AllocaForValueMapTy AllocaForValue; + + bool HasNonEmptyInlineAsm; + std::unique_ptr<CallInst> EmptyInlineAsm; + + FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) + : F(F), + ASan(ASan), + DIB(*F.getParent(), /*AllowUnresolved*/ false), + C(ASan.C), + IntptrTy(ASan.IntptrTy), + IntptrPtrTy(PointerType::get(IntptrTy, 0)), + Mapping(ASan.Mapping), + StackAlignment(1 << Mapping.Scale), + HasNonEmptyInlineAsm(false), + EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {} + + bool runOnFunction() { + if (!ClStack) return false; + // Collect alloca, ret, lifetime instructions etc. + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB); + + if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false; + + initializeCallbacks(*F.getParent()); + + poisonStack(); + + if (ClDebugStack) { + DEBUG(dbgs() << F); + } + return true; + } + + // Finds all Alloca instructions and puts + // poisoned red zones around all of them. + // Then unpoison everything back before the function returns. + void poisonStack(); + + void createDynamicAllocasInitStorage(); + + // ----------------------- Visitors. + /// \brief Collect all Ret instructions. + void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); } + + void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore, + Value *SavedStack) { + IRBuilder<> IRB(InstBefore); + IRB.CreateCall(AsanAllocasUnpoisonFunc, + {IRB.CreateLoad(DynamicAllocaLayout), + IRB.CreatePtrToInt(SavedStack, IntptrTy)}); + } + + // Unpoison dynamic allocas redzones. + void unpoisonDynamicAllocas() { + for (auto &Ret : RetVec) + unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout); + + for (auto &StackRestoreInst : StackRestoreVec) + unpoisonDynamicAllocasBeforeInst(StackRestoreInst, + StackRestoreInst->getOperand(0)); + } + + // Deploy and poison redzones around dynamic alloca call. To do this, we + // should replace this call with another one with changed parameters and + // replace all its uses with new address, so + // addr = alloca type, old_size, align + // is replaced by + // new_size = (old_size + additional_size) * sizeof(type) + // tmp = alloca i8, new_size, max(align, 32) + // addr = tmp + 32 (first 32 bytes are for the left redzone). + // Additional_size is added to make new memory allocation contain not only + // requested memory, but also left, partial and right redzones. + void handleDynamicAllocaCall(AllocaInst *AI); + + /// \brief Collect Alloca instructions we want (and can) handle. + void visitAllocaInst(AllocaInst &AI) { + if (!ASan.isInterestingAlloca(AI)) return; + + StackAlignment = std::max(StackAlignment, AI.getAlignment()); + if (ASan.isDynamicAlloca(AI)) + DynamicAllocaVec.push_back(&AI); + else + AllocaVec.push_back(&AI); + } + + /// \brief Collect lifetime intrinsic calls to check for use-after-scope + /// errors. + void visitIntrinsicInst(IntrinsicInst &II) { + Intrinsic::ID ID = II.getIntrinsicID(); + if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II); + if (!ClCheckLifetime) return; + if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end) + return; + // Found lifetime intrinsic, add ASan instrumentation if necessary. + ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0)); + // If size argument is undefined, don't do anything. + if (Size->isMinusOne()) return; + // Check that size doesn't saturate uint64_t and can + // be stored in IntptrTy. 
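The handleDynamicAllocaCall comment above replaces addr = alloca type, old_size with an over-allocation whose first kAllocaRzSize (32) bytes form the left redzone, returning tmp + 32 as the user pointer. A standalone sketch of that size arithmetic; the exact partial/right-redzone rounding happens in handleDynamicAllocaCall, which is outside this hunk, so the round-up below is an assumption:

#include <cstdint>

static const uint64_t kRz = 32;  // kAllocaRzSize, the left redzone above

// Byte-level version of the comment's transformation (the IR version first
// multiplies the element count by sizeof(type)).
uint64_t paddedAllocaSize(uint64_t OldSizeInBytes) {
  uint64_t Payload = (OldSizeInBytes + kRz - 1) & ~(kRz - 1);  // assumed round-up
  return kRz + Payload + kRz;  // left redzone + payload + right redzone
}
uint64_t userPointerOffset() { return kRz; }  // addr = tmp + 32
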
+ const uint64_t SizeValue = Size->getValue().getLimitedValue(); + if (SizeValue == ~0ULL || + !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) + return; + // Find alloca instruction that corresponds to llvm.lifetime argument. + AllocaInst *AI = findAllocaForValue(II.getArgOperand(1)); + if (!AI) return; + bool DoPoison = (ID == Intrinsic::lifetime_end); + AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison}; + AllocaPoisonCallVec.push_back(APC); + } + + void visitCallInst(CallInst &CI) { + HasNonEmptyInlineAsm |= + CI.isInlineAsm() && !CI.isIdenticalTo(EmptyInlineAsm.get()); + } + + // ---------------------- Helpers. + void initializeCallbacks(Module &M); + + bool doesDominateAllExits(const Instruction *I) const { + for (auto Ret : RetVec) { + if (!ASan.getDominatorTree().dominates(I, Ret)) return false; + } + return true; + } + + /// Finds alloca where the value comes from. + AllocaInst *findAllocaForValue(Value *V); + void poisonRedZones(ArrayRef<uint8_t> ShadowBytes, IRBuilder<> &IRB, + Value *ShadowBase, bool DoPoison); + void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison); + + void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase, + int Size); + Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L, + bool Dynamic); + PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue, + Instruction *ThenTerm, Value *ValueIfFalse); +}; + +} // namespace + +char AddressSanitizer::ID = 0; +INITIALIZE_PASS_BEGIN( + AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END( + AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, + false) +FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel) { + return new AddressSanitizer(CompileKernel); +} + +char AddressSanitizerModule::ID = 0; +INITIALIZE_PASS( + AddressSanitizerModule, "asan-module", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs." + "ModulePass", + false, false) +ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel) { + return new AddressSanitizerModule(CompileKernel); +} + +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = countTrailingZeros(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + +// \brief Create a constant for Str so that we can pass it to the run-time lib. +static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, + bool AllowMerging) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + // We use private linkage for module-local strings. If they can be merged + // with another one, we set the unnamed_addr attribute. + GlobalVariable *GV = + new GlobalVariable(M, StrConst->getType(), true, + GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix); + if (AllowMerging) GV->setUnnamedAddr(true); + GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + return GV; +} + +/// \brief Create a global describing a source location. 
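TypeSizeToSizeIndex above converts an access width in bits into an index into the [2][2][kNumberOfAccessSizes] callback arrays. A standalone equivalent, using the GCC/Clang __builtin_ctz in place of LLVM's countTrailingZeros:

#include <cassert>
#include <cstddef>
#include <cstdint>

static const size_t kNumberOfAccessSizes = 5;  // 1, 2, 4, 8, 16 bytes

// TypeSizeBits must be a power-of-two number of bytes (8..128 bits).
size_t typeSizeToSizeIndex(uint32_t TypeSizeBits) {
  size_t Res = __builtin_ctz(TypeSizeBits / 8);
  assert(Res < kNumberOfAccessSizes);
  return Res;  // 8 -> 0, 16 -> 1, 32 -> 2, 64 -> 3, 128 -> 4
}
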
+static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M, + LocationMetadata MD) { + Constant *LocData[] = { + createPrivateGlobalForString(M, MD.Filename, true), + ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo), + ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo), + }; + auto LocStruct = ConstantStruct::getAnon(LocData); + auto GV = new GlobalVariable(M, LocStruct->getType(), true, + GlobalValue::PrivateLinkage, LocStruct, + kAsanGenPrefix); + GV->setUnnamedAddr(true); + return GV; +} + +static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { + return G->getName().find(kAsanGenPrefix) == 0 || + G->getName().find(kSanCovGenPrefix) == 0; +} + +Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { + // Shadow >> scale + Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); + if (Mapping.Offset == 0) return Shadow; + // (Shadow >> scale) | offset + if (Mapping.OrShadowOffset) + return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); + else + return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); +} + +// Instrument memset/memmove/memcpy +void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + IRBuilder<> IRB(MI); + if (isa<MemTransferInst>(MI)) { + IRB.CreateCall( + isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } else if (isa<MemSetInst>(MI)) { + IRB.CreateCall( + AsanMemset, + {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); + } + MI->eraseFromParent(); +} + +/// Check if we want (and can) handle this alloca. +bool AddressSanitizer::isInterestingAlloca(AllocaInst &AI) { + auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI); + + if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) + return PreviouslySeenAllocaInfo->getSecond(); + + bool IsInteresting = + (AI.getAllocatedType()->isSized() && + // alloca() may be called with 0 size, ignore it. + getAllocaSizeInBytes(&AI) > 0 && + // We are only interested in allocas not promotable to registers. + // Promotable allocas are common under -O0. + (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI) || + isDynamicAlloca(AI))); + + ProcessedAllocas[&AI] = IsInteresting; + return IsInteresting; +} + +/// If I is an interesting memory access, return the PointerOperand +/// and set IsWrite/Alignment. Otherwise return nullptr. +Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, + bool *IsWrite, + uint64_t *TypeSize, + unsigned *Alignment) { + // Skip memory accesses inserted by another instrumentation. 
+ if (I->getMetadata("nosanitize")) return nullptr; + + Value *PtrOperand = nullptr; + const DataLayout &DL = I->getModule()->getDataLayout(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return nullptr; + *IsWrite = false; + *TypeSize = DL.getTypeStoreSizeInBits(LI->getType()); + *Alignment = LI->getAlignment(); + PtrOperand = LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType()); + *Alignment = SI->getAlignment(); + PtrOperand = SI->getPointerOperand(); + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType()); + *Alignment = 0; + PtrOperand = RMW->getPointerOperand(); + } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return nullptr; + *IsWrite = true; + *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType()); + *Alignment = 0; + PtrOperand = XCHG->getPointerOperand(); + } + + // Treat memory accesses to promotable allocas as non-interesting since they + // will not cause memory violations. This greatly speeds up the instrumented + // executable at -O0. + if (ClSkipPromotableAllocas) + if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand)) + return isInterestingAlloca(*AI) ? AI : nullptr; + + return PtrOperand; +} + +static bool isPointerOperand(Value *V) { + return V->getType()->isPointerTy() || isa<PtrToIntInst>(V); +} + +// This is a rough heuristic; it may cause both false positives and +// false negatives. The proper implementation requires cooperation with +// the frontend. +static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) { + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) { + if (!Cmp->isRelational()) return false; + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + if (BO->getOpcode() != Instruction::Sub) return false; + } else { + return false; + } + if (!isPointerOperand(I->getOperand(0)) || + !isPointerOperand(I->getOperand(1))) + return false; + return true; +} + +bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) { + // If a global variable does not have dynamic initialization we don't + // have to instrument it. However, if a global does not have initializer + // at all, we assume it has dynamic initializer (in other TU). + return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit; +} + +void AddressSanitizer::instrumentPointerComparisonOrSubtraction( + Instruction *I) { + IRBuilder<> IRB(I); + Function *F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction; + Value *Param[2] = {I->getOperand(0), I->getOperand(1)}; + for (int i = 0; i < 2; i++) { + if (Param[i]->getType()->isPointerTy()) + Param[i] = IRB.CreatePointerCast(Param[i], IntptrTy); + } + IRB.CreateCall(F, Param); +} + +void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, + Instruction *I, bool UseCalls, + const DataLayout &DL) { + bool IsWrite = false; + unsigned Alignment = 0; + uint64_t TypeSize = 0; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment); + assert(Addr); + + // Optimization experiments. + // The experiments can be used to evaluate potential optimizations that remove + // instrumentation (assess false negatives). 
Instead of completely removing + // some instrumentation, you set Exp to a non-zero value (mask of optimization + // experiments that want to remove instrumentation of this instruction). + // If Exp is non-zero, this pass will emit special calls into runtime + // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls + // make runtime terminate the program in a special way (with a different + // exit status). Then you run the new compiler on a buggy corpus, collect + // the special terminations (ideally, you don't see them at all -- no false + // negatives) and make the decision on the optimization. + uint32_t Exp = ClForceExperiment; + + if (ClOpt && ClOptGlobals) { + // If initialization order checking is disabled, a simple access to a + // dynamically initialized global is always valid. + GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL)); + if (G != NULL && (!ClInitializers || GlobalIsLinkerInitialized(G)) && + isSafeAccess(ObjSizeVis, Addr, TypeSize)) { + NumOptimizedAccessesToGlobalVar++; + return; + } + } + + if (ClOpt && ClOptStack) { + // A direct inbounds access to a stack variable is always valid. + if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) && + isSafeAccess(ObjSizeVis, Addr, TypeSize)) { + NumOptimizedAccessesToStackVar++; + return; + } + } + + if (IsWrite) + NumInstrumentedWrites++; + else + NumInstrumentedReads++; + + unsigned Granularity = 1 << Mapping.Scale; + // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check + // if the data is properly aligned. + if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 || + TypeSize == 128) && + (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8)) + return instrumentAddress(I, I, Addr, TypeSize, IsWrite, nullptr, UseCalls, + Exp); + instrumentUnusualSizeOrAlignment(I, Addr, TypeSize, IsWrite, nullptr, + UseCalls, Exp); +} + +Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore, + Value *Addr, bool IsWrite, + size_t AccessSizeIndex, + Value *SizeArgument, + uint32_t Exp) { + IRBuilder<> IRB(InsertBefore); + Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp); + CallInst *Call = nullptr; + if (SizeArgument) { + if (Exp == 0) + Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0], + {Addr, SizeArgument}); + else + Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1], + {Addr, SizeArgument, ExpVal}); + } else { + if (Exp == 0) + Call = + IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr); + else + Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex], + {Addr, ExpVal}); + } + + // We don't do Call->setDoesNotReturn() because the BB already has + // UnreachableInst at the end. + // This EmptyAsm is required to avoid callback merge. 
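instrumentMop above emits a single inline check only for 1-, 2-, 4-, 8-, or 16-byte accesses whose alignment guarantees the access stays within what one shadow byte describes; anything else is routed to instrumentUnusualSizeOrAlignment. The predicate, lifted into a standalone helper:

#include <cstdint>

// Granularity is 1 << Mapping.Scale, i.e. 8 bytes at the default scale.
bool canUseSingleCheck(uint64_t TypeSizeBits, unsigned Alignment,
                       unsigned Granularity) {
  bool PowerOfTwoSize = TypeSizeBits == 8 || TypeSizeBits == 16 ||
                        TypeSizeBits == 32 || TypeSizeBits == 64 ||
                        TypeSizeBits == 128;
  return PowerOfTwoSize && (Alignment >= Granularity || Alignment == 0 ||
                            Alignment >= TypeSizeBits / 8);
}
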
+ IRB.CreateCall(EmptyAsm, {}); + return Call; +} + +Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, + uint32_t TypeSize) { + size_t Granularity = 1 << Mapping.Scale; + // Addr & (Granularity - 1) + Value *LastAccessedByte = + IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + if (TypeSize / 8 > 1) + LastAccessedByte = IRB.CreateAdd( + LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = + IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); +} + +void AddressSanitizer::instrumentAddress(Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, + uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, + uint32_t Exp) { + IRBuilder<> IRB(InsertBefore); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + + if (UseCalls) { + if (Exp == 0) + IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex], + AddrLong); + else + IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex], + {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)}); + return; + } + + Type *ShadowTy = + IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale)); + Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Value *ShadowPtr = memToShadow(AddrLong, IRB); + Value *CmpVal = Constant::getNullValue(ShadowTy); + Value *ShadowValue = + IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); + + Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); + size_t Granularity = 1 << Mapping.Scale; + TerminatorInst *CrashTerm = nullptr; + + if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { + // We use branch weights for the slow path check, to indicate that the slow + // path is rarely taken. This seems to be the case for SPEC benchmarks. + TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000)); + assert(cast<BranchInst>(CheckTerm)->isUnconditional()); + BasicBlock *NextBB = CheckTerm->getSuccessor(0); + IRB.SetInsertPoint(CheckTerm); + Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); + BasicBlock *CrashBlock = + BasicBlock::Create(*C, "", NextBB->getParent(), NextBB); + CrashTerm = new UnreachableInst(*C, CrashBlock); + BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } else { + CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, true); + } + + Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite, + AccessSizeIndex, SizeArgument, Exp); + Crash->setDebugLoc(OrigIns->getDebugLoc()); +} + +// Instrument unusual size or unusual alignment. +// We can not do it with a single check, so we do 1-byte check for the first +// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able +// to report the actual access size. 
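The check emitted by instrumentAddress above has these runtime semantics: a shadow byte of 0 means the whole granule is addressable; a positive value k means only the first k bytes are (redzones use negative encodings), so the slow path compares the last accessed byte against k with a signed compare. A self-contained model, with a toy shadow array standing in for the real (Addr >> Scale) + Offset load:

#include <cstdint>
#include <cstdio>

static int8_t Shadow[4096];  // toy shadow, for this sketch only

// True if a 1..8-byte access at Addr should report. Mirrors the emitted
// fast path (shadow byte == 0) and createSlowPathCmp's signed ICmpSGE.
bool isPoisoned(uint64_t Addr, uint64_t Size, unsigned Granularity = 8) {
  int8_t K = Shadow[(Addr / Granularity) % sizeof(Shadow)];
  if (K == 0) return false;  // whole granule addressable
  int64_t LastAccessedByte = (Addr & (Granularity - 1)) + Size - 1;
  return LastAccessedByte >= K;  // negative K (redzone) always reports
}

int main() {
  Shadow[0] = 4;  // only the first 4 bytes of granule 0 are addressable
  printf("%d %d\n", isPoisoned(0, 4), isPoisoned(0, 5));  // prints: 0 1
  return 0;
}
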
+void AddressSanitizer::instrumentUnusualSizeOrAlignment( + Instruction *I, Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument, bool UseCalls, uint32_t Exp) { + IRBuilder<> IRB(I); + Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + if (UseCalls) { + if (Exp == 0) + IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0], + {AddrLong, Size}); + else + IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1], + {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)}); + } else { + Value *LastByte = IRB.CreateIntToPtr( + IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), + Addr->getType()); + instrumentAddress(I, I, Addr, 8, IsWrite, Size, false, Exp); + instrumentAddress(I, I, LastByte, 8, IsWrite, Size, false, Exp); + } +} + +void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit, + GlobalValue *ModuleName) { + // Set up the arguments to our poison/unpoison functions. + IRBuilder<> IRB(GlobalInit.begin()->getFirstInsertionPt()); + + // Add a call to poison all external globals before the given function starts. + Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); + IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr); + + // Add calls to unpoison all globals before each return instruction. + for (auto &BB : GlobalInit.getBasicBlockList()) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) + CallInst::Create(AsanUnpoisonGlobals, "", RI); +} + +void AddressSanitizerModule::createInitializerPoisonCalls( + Module &M, GlobalValue *ModuleName) { + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + + ConstantArray *CA = cast<ConstantArray>(GV->getInitializer()); + for (Use &OP : CA->operands()) { + if (isa<ConstantAggregateZero>(OP)) continue; + ConstantStruct *CS = cast<ConstantStruct>(OP); + + // Must have a function or null ptr. + if (Function *F = dyn_cast<Function>(CS->getOperand(1))) { + if (F->getName() == kAsanModuleCtorName) continue; + ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0)); + // Don't instrument CTORs that will run before asan.module_ctor. + if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue; + poisonOneInitializer(*F, ModuleName); + } + } +} + +bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) { + Type *Ty = cast<PointerType>(G->getType())->getElementType(); + DEBUG(dbgs() << "GLOBAL: " << *G << "\n"); + + if (GlobalsMD.get(G).IsBlacklisted) return false; + if (!Ty->isSized()) return false; + if (!G->hasInitializer()) return false; + if (GlobalWasGeneratedByAsan(G)) return false; // Our own global. + // Touch only those globals that will not be defined in other modules. + // Don't handle ODR linkage types and COMDATs since other modules may be built + // without ASan. + if (G->getLinkage() != GlobalVariable::ExternalLinkage && + G->getLinkage() != GlobalVariable::PrivateLinkage && + G->getLinkage() != GlobalVariable::InternalLinkage) + return false; + if (G->hasComdat()) return false; + // Two problems with thread-locals: + // - The address of the main thread's copy can't be computed at link-time. + // - Need to poison all copies, not just the main thread's one. + if (G->isThreadLocal()) return false; + // For now, just ignore this Global if the alignment is large. 
+ if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false; + + if (G->hasSection()) { + StringRef Section(G->getSection()); + + // Globals from llvm.metadata aren't emitted, do not instrument them. + if (Section == "llvm.metadata") return false; + // Do not instrument globals from special LLVM sections. + if (Section.find("__llvm") != StringRef::npos) return false; + + // Callbacks put into the CRT initializer/terminator sections + // should not be instrumented. + // See https://code.google.com/p/address-sanitizer/issues/detail?id=305 + // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx + if (Section.startswith(".CRT")) { + DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n"); + return false; + } + + if (TargetTriple.isOSBinFormatMachO()) { + StringRef ParsedSegment, ParsedSection; + unsigned TAA = 0, StubSize = 0; + bool TAAParsed; + std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier( + Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize); + if (!ErrorCode.empty()) { + assert(false && "Invalid section specifier."); + return false; + } + + // Ignore the globals from the __OBJC section. The ObjC runtime assumes + // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to + // them. + if (ParsedSegment == "__OBJC" || + (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) { + DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n"); + return false; + } + // See http://code.google.com/p/address-sanitizer/issues/detail?id=32 + // Constant CFString instances are compiled in the following way: + // -- the string buffer is emitted into + // __TEXT,__cstring,cstring_literals + // -- the constant NSConstantString structure referencing that buffer + // is placed into __DATA,__cfstring + // Therefore there's no point in placing redzones into __DATA,__cfstring. + // Moreover, it causes the linker to crash on OS X 10.7 + if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") { + DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n"); + return false; + } + // The linker merges the contents of cstring_literals and removes the + // trailing zeroes. + if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) { + DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n"); + return false; + } + } + } + + return true; +} + +void AddressSanitizerModule::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + // Declare our poisoning and unpoisoning functions. + AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr)); + AsanPoisonGlobals->setLinkage(Function::ExternalLinkage); + AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr)); + AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage); + // Declare functions that register/unregister globals. + AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); + AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); + AsanUnregisterGlobals = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(), + IntptrTy, IntptrTy, nullptr)); + AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); +} + +// This function replaces all global variables with new variables that have +// trailing redzones. 
It also creates a function that poisons +// redzones and inserts this function into llvm.global_ctors. +bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) { + GlobalsMD.init(M); + + SmallVector<GlobalVariable *, 16> GlobalsToChange; + + for (auto &G : M.globals()) { + if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G); + } + + size_t n = GlobalsToChange.size(); + if (n == 0) return false; + + // A global is described by a structure + // size_t beg; + // size_t size; + // size_t size_with_redzone; + // const char *name; + // const char *module_name; + // size_t has_dynamic_init; + // void *source_location; + // We initialize an array of such structures and pass it to a run-time call. + StructType *GlobalStructTy = + StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, + IntptrTy, IntptrTy, nullptr); + SmallVector<Constant *, 16> Initializers(n); + + bool HasDynamicallyInitializedGlobals = false; + + // We shouldn't merge same module names, as this string serves as unique + // module ID in runtime. + GlobalVariable *ModuleName = createPrivateGlobalForString( + M, M.getModuleIdentifier(), /*AllowMerging*/ false); + + auto &DL = M.getDataLayout(); + for (size_t i = 0; i < n; i++) { + static const uint64_t kMaxGlobalRedzone = 1 << 18; + GlobalVariable *G = GlobalsToChange[i]; + + auto MD = GlobalsMD.get(G); + // Create string holding the global name (use global name from metadata + // if it's available, otherwise just write the name of global variable). + GlobalVariable *Name = createPrivateGlobalForString( + M, MD.Name.empty() ? G->getName() : MD.Name, + /*AllowMerging*/ true); + + PointerType *PtrTy = cast<PointerType>(G->getType()); + Type *Ty = PtrTy->getElementType(); + uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); + uint64_t MinRZ = MinRedzoneSizeForGlobal(); + // MinRZ <= RZ <= kMaxGlobalRedzone + // and trying to make RZ to be ~ 1/4 of SizeInBytes. + uint64_t RZ = std::max( + MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ)); + uint64_t RightRedzoneSize = RZ; + // Round up to MinRZ + if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); + assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); + Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); + + StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); + Constant *NewInitializer = + ConstantStruct::get(NewTy, G->getInitializer(), + Constant::getNullValue(RightRedZoneTy), nullptr); + + // Create a new global variable with enough space for a redzone. 
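The redzone sizing above targets roughly a quarter of the object, clamped to [MinRZ, 1 << 18] in MinRZ granules, then pads so the global plus its right redzone stays MinRZ-aligned. Standalone, assuming the default-scale minimum redzone of 32 bytes:

#include <algorithm>
#include <cstdint>

// Mirrors the RZ / RightRedzoneSize computation above.
uint64_t rightRedzoneSize(uint64_t SizeInBytes, uint64_t MinRZ = 32) {
  const uint64_t kMaxGlobalRedzone = 1 << 18;
  uint64_t RZ = std::max(
      MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
  uint64_t Right = RZ;
  if (SizeInBytes % MinRZ) Right += MinRZ - (SizeInBytes % MinRZ);
  return Right;  // (SizeInBytes + Right) % MinRZ == 0 holds afterwards
}
// e.g. rightRedzoneSize(17) == 47: a 32-byte redzone plus 15 bytes of padding.
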
+ GlobalValue::LinkageTypes Linkage = G->getLinkage(); + if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage) + Linkage = GlobalValue::InternalLinkage; + GlobalVariable *NewGlobal = + new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer, + "", G, G->getThreadLocalMode()); + NewGlobal->copyAttributesFrom(G); + NewGlobal->setAlignment(MinRZ); + + Value *Indices2[2]; + Indices2[0] = IRB.getInt32(0); + Indices2[1] = IRB.getInt32(0); + + G->replaceAllUsesWith( + ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true)); + NewGlobal->takeName(G); + G->eraseFromParent(); + + Constant *SourceLoc; + if (!MD.SourceLoc.empty()) { + auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc); + SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy); + } else { + SourceLoc = ConstantInt::get(IntptrTy, 0); + } + + Initializers[i] = ConstantStruct::get( + GlobalStructTy, ConstantExpr::getPointerCast(NewGlobal, IntptrTy), + ConstantInt::get(IntptrTy, SizeInBytes), + ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), + ConstantExpr::getPointerCast(Name, IntptrTy), + ConstantExpr::getPointerCast(ModuleName, IntptrTy), + ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, nullptr); + + if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; + + DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); + } + + ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); + GlobalVariable *AllGlobals = new GlobalVariable( + M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage, + ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); + + // Create calls for poisoning before initializers run and unpoisoning after. + if (HasDynamicallyInitializedGlobals) + createInitializerPoisonCalls(M, ModuleName); + IRB.CreateCall(AsanRegisterGlobals, + {IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)}); + + // We also need to unregister globals at the end, e.g. when a shared library + // gets closed. + Function *AsanDtorFunction = + Function::Create(FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); + BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); + IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); + IRB_Dtor.CreateCall(AsanUnregisterGlobals, + {IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)}); + appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); + + DEBUG(dbgs() << M); + return true; +} + +bool AddressSanitizerModule::runOnModule(Module &M) { + C = &(M.getContext()); + int LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); + initializeCallbacks(M); + + bool Changed = false; + + // TODO(glider): temporarily disabled globals instrumentation for KASan. + if (ClGlobals && !CompileKernel) { + Function *CtorFunc = M.getFunction(kAsanModuleCtorName); + assert(CtorFunc); + IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); + Changed |= InstrumentGlobals(IRB, M); + } + + return Changed; +} + +void AddressSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(*C); + // Create __asan_report* callbacks. + // IsWrite, TypeSize and Exp are encoded in the function name. 
+ for (int Exp = 0; Exp < 2; Exp++) { + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + const std::string TypeStr = AccessIsWrite ? "store" : "load"; + const std::string ExpStr = Exp ? "exp_" : ""; + const std::string SuffixStr = CompileKernel ? "N" : "_n"; + const std::string EndingStr = CompileKernel ? "_noabort" : ""; + const Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr; + // TODO(glider): for KASan builds add _noabort to error reporting + // functions and make them actually noabort (remove the UnreachableInst). + AsanErrorCallbackSized[AccessIsWrite][Exp] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr, + IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); + AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr, + IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr)); + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + const std::string Suffix = TypeStr + itostr(1 << AccessSizeIndex); + AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanReportErrorTemplate + ExpStr + Suffix, + IRB.getVoidTy(), IntptrTy, ExpType, nullptr)); + AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr, + IRB.getVoidTy(), IntptrTy, ExpType, nullptr)); + } + } + } + + const std::string MemIntrinCallbackPrefix = + CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix; + AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr)); + AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr)); + AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr)); + + AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr)); + + AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); + AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr)); + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); +} + +// virtual +bool AddressSanitizer::doInitialization(Module &M) { + // Initialize the private fields. No one has accessed them before. 
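initializeCallbacks above assembles every runtime entry point name from three axes: read/write, experiment, and access size. A sketch of the non-kernel report-callback naming scheme (kernel builds swap in the N/_noabort variants, per the SuffixStr/EndingStr strings above):

#include <string>

// Non-kernel report callbacks, as assembled above:
//   __asan_report_[exp_](load|store){1,2,4,8,16}
std::string reportCallbackName(bool IsWrite, bool Exp, int AccessSizeIndex) {
  return std::string("__asan_report_") + (Exp ? "exp_" : "") +
         (IsWrite ? "store" : "load") + std::to_string(1 << AccessSizeIndex);
}
// reportCallbackName(false, false, 2) == "__asan_report_load4"
// reportCallbackName(true,  true,  0) == "__asan_report_exp_store1"
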
+ + GlobalsMD.init(M); + + C = &(M.getContext()); + LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + + if (!CompileKernel) { + std::tie(AsanCtorFunction, AsanInitFunction) = + createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName, kAsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}); + appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); + } + Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); + return true; +} + +bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { + // For each NSObject descendant having a +load method, this method is invoked + // by the ObjC runtime before any of the static constructors is called. + // Therefore we need to instrument such methods with a call to __asan_init + // at the beginning in order to initialize our runtime before any access to + // the shadow memory. + // We cannot just ignore these methods, because they may call other + // instrumented functions. + if (F.getName().find(" load]") != std::string::npos) { + IRBuilder<> IRB(F.begin()->begin()); + IRB.CreateCall(AsanInitFunction, {}); + return true; + } + return false; +} + +bool AddressSanitizer::runOnFunction(Function &F) { + if (&F == AsanCtorFunction) return false; + if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; + DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); + initializeCallbacks(*F.getParent()); + + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + // If needed, insert __asan_init before checking for SanitizeAddress attr. + maybeInsertAsanInitAtFunctionEntry(F); + + if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return false; + + if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) return false; + + // We want to instrument every address only once per basic block (unless there + // are calls between uses). + SmallSet<Value *, 16> TempsToInstrument; + SmallVector<Instruction *, 16> ToInstrument; + SmallVector<Instruction *, 8> NoReturnCalls; + SmallVector<BasicBlock *, 16> AllBlocks; + SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts; + int NumAllocas = 0; + bool IsWrite; + unsigned Alignment; + uint64_t TypeSize; + + // Fill the set of memory operations to instrument. + for (auto &BB : F) { + AllBlocks.push_back(&BB); + TempsToInstrument.clear(); + int NumInsnsPerBB = 0; + for (auto &Inst : BB) { + if (LooksLikeCodeInBug11395(&Inst)) return false; + if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize, + &Alignment)) { + if (ClOpt && ClOptSameTemp) { + if (!TempsToInstrument.insert(Addr).second) + continue; // We've seen this temp in the current BB. + } + } else if (ClInvalidPointerPairs && + isInterestingPointerComparisonOrSubtraction(&Inst)) { + PointerComparisonsOrSubtracts.push_back(&Inst); + continue; + } else if (isa<MemIntrinsic>(Inst)) { + // ok, take it. + } else { + if (isa<AllocaInst>(Inst)) NumAllocas++; + CallSite CS(&Inst); + if (CS) { + // A call inside BB. 
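As the comment above notes, a call inside the block invalidates the dedup set: runOnFunction instruments each address at most once per basic block, but a callee may free or re-poison memory the earlier check vouched for, so TempsToInstrument is cleared at block entry and again at every call. The bookkeeping in isolation:

#include <set>

// TempsToInstrument above, reduced to its essentials.
struct BBDedup {
  std::set<const void *> Seen;
  void enterBlock() { Seen.clear(); }
  void sawCall() { Seen.clear(); }  // "A call inside BB" resets the set
  bool shouldInstrument(const void *Addr) { return Seen.insert(Addr).second; }
};
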
+ TempsToInstrument.clear(); + if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction()); + } + continue; + } + ToInstrument.push_back(&Inst); + NumInsnsPerBB++; + if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break; + } + } + + bool UseCalls = + CompileKernel || + (ClInstrumentationWithCallsThreshold >= 0 && + ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold); + const TargetLibraryInfo *TLI = + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + const DataLayout &DL = F.getParent()->getDataLayout(); + ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), + /*RoundToAlign=*/true); + + // Instrument. + int NumInstrumented = 0; + for (auto Inst : ToInstrument) { + if (ClDebugMin < 0 || ClDebugMax < 0 || + (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { + if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment)) + instrumentMop(ObjSizeVis, Inst, UseCalls, + F.getParent()->getDataLayout()); + else + instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + } + NumInstrumented++; + } + + FunctionStackPoisoner FSP(F, *this); + bool ChangedStack = FSP.runOnFunction(); + + // We must unpoison the stack before every NoReturn call (throw, _exit, etc). + // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37 + for (auto CI : NoReturnCalls) { + IRBuilder<> IRB(CI); + IRB.CreateCall(AsanHandleNoReturnFunc, {}); + } + + for (auto Inst : PointerComparisonsOrSubtracts) { + instrumentPointerComparisonOrSubtraction(Inst); + NumInstrumented++; + } + + bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty(); + + DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n"); + + return res; +} + +// Workaround for bug 11395: we don't want to instrument stack in functions +// with large assembly blobs (32-bit only), otherwise reg alloc may crash. +// FIXME: remove once the bug 11395 is fixed. +bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { + if (LongSize != 32) return false; + CallInst *CI = dyn_cast<CallInst>(I); + if (!CI || !CI->isInlineAsm()) return false; + if (CI->getNumArgOperands() <= 5) return false; + // We have inline assembly with quite a few arguments. 
+  return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+    std::string Suffix = itostr(i);
+    AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction(
+        M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
+                              IntptrTy, nullptr));
+    AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction(
+        M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+                              IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+  }
+  AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
+                            IntptrTy, IntptrTy, nullptr));
+  AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
+                            IntptrTy, IntptrTy, nullptr));
+  AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+  AsanAllocasUnpoisonFunc =
+      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+          kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+}
+
+void FunctionStackPoisoner::poisonRedZones(ArrayRef<uint8_t> ShadowBytes,
+                                           IRBuilder<> &IRB, Value *ShadowBase,
+                                           bool DoPoison) {
+  size_t n = ShadowBytes.size();
+  size_t i = 0;
+  // We need to (un)poison n bytes of stack shadow. Poison as many as we can
+  // using 64-bit stores (if we are on a 64-bit arch), then poison the rest
+  // with 32-bit stores, then with 16-bit stores, then with 8-bit stores.
+  for (size_t LargeStoreSizeInBytes = ASan.LongSize / 8;
+       LargeStoreSizeInBytes != 0; LargeStoreSizeInBytes /= 2) {
+    for (; i + LargeStoreSizeInBytes - 1 < n; i += LargeStoreSizeInBytes) {
+      uint64_t Val = 0;
+      for (size_t j = 0; j < LargeStoreSizeInBytes; j++) {
+        if (F.getParent()->getDataLayout().isLittleEndian())
+          Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+        else
+          Val = (Val << 8) | ShadowBytes[i + j];
+      }
+      if (!Val) continue;
+      Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+      Type *StoreTy = Type::getIntNTy(*C, LargeStoreSizeInBytes * 8);
+      Value *Poison = ConstantInt::get(StoreTy, DoPoison ? Val : 0);
+      IRB.CreateStore(Poison,
+                      IRB.CreateIntToPtr(Ptr, StoreTy->getPointerTo()));
+    }
+  }
+}
+
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes,
+// one for every power of 2 from kMinStackMallocSize (64B) to
+// kMaxStackMallocSize (64K).
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+  assert(LocalStackSize <= kMaxStackMallocSize);
+  uint64_t MaxSize = kMinStackMallocSize;
+  for (int i = 0;; i++, MaxSize *= 2)
+    if (LocalStackSize <= MaxSize) return i;
+  llvm_unreachable("impossible LocalStackSize");
+}
+
+// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic.
+// We cannot use the memset intrinsic because it may end up calling the actual
+// memset. Size is a multiple of 8.
+// Currently this generates 8-byte stores on x86_64; it may be better to
+// generate wider stores.
+void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
+    IRBuilder<> &IRB, Value *ShadowBase, int Size) {
+  assert(!(Size % 8));
+
+  // kAsanStackAfterReturnMagic is 0xf5.
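+  // The 64-bit constant below replicates that byte into all eight byte lanes,
+  // so each 8-byte store marks eight shadow bytes at once.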
+ const uint64_t kAsanStackAfterReturnMagic64 = 0xf5f5f5f5f5f5f5f5ULL; + + for (int i = 0; i < Size; i += 8) { + Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)); + IRB.CreateStore( + ConstantInt::get(IRB.getInt64Ty(), kAsanStackAfterReturnMagic64), + IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo())); + } +} + +PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond, + Value *ValueIfTrue, + Instruction *ThenTerm, + Value *ValueIfFalse) { + PHINode *PHI = IRB.CreatePHI(IntptrTy, 2); + BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent(); + PHI->addIncoming(ValueIfFalse, CondBlock); + BasicBlock *ThenBlock = ThenTerm->getParent(); + PHI->addIncoming(ValueIfTrue, ThenBlock); + return PHI; +} + +Value *FunctionStackPoisoner::createAllocaForLayout( + IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) { + AllocaInst *Alloca; + if (Dynamic) { + Alloca = IRB.CreateAlloca(IRB.getInt8Ty(), + ConstantInt::get(IRB.getInt64Ty(), L.FrameSize), + "MyAlloca"); + } else { + Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize), + nullptr, "MyAlloca"); + assert(Alloca->isStaticAlloca()); + } + assert((ClRealignStack & (ClRealignStack - 1)) == 0); + size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); + Alloca->setAlignment(FrameAlignment); + return IRB.CreatePointerCast(Alloca, IntptrTy); +} + +void FunctionStackPoisoner::createDynamicAllocasInitStorage() { + BasicBlock &FirstBB = *F.begin(); + IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin())); + DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); + IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); + DynamicAllocaLayout->setAlignment(32); +} + +void FunctionStackPoisoner::poisonStack() { + assert(AllocaVec.size() > 0 || DynamicAllocaVec.size() > 0); + + if (ClInstrumentAllocas && DynamicAllocaVec.size() > 0) { + // Handle dynamic allocas. + createDynamicAllocasInitStorage(); + for (auto &AI : DynamicAllocaVec) handleDynamicAllocaCall(AI); + + unpoisonDynamicAllocas(); + } + + if (AllocaVec.size() == 0) return; + + int StackMallocIdx = -1; + DebugLoc EntryDebugLocation; + if (auto SP = getDISubprogram(&F)) + EntryDebugLocation = DebugLoc::get(SP->getScopeLine(), 0, SP); + + Instruction *InsBefore = AllocaVec[0]; + IRBuilder<> IRB(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); + + SmallVector<ASanStackVariableDescription, 16> SVD; + SVD.reserve(AllocaVec.size()); + for (AllocaInst *AI : AllocaVec) { + ASanStackVariableDescription D = {AI->getName().data(), + ASan.getAllocaSizeInBytes(AI), + AI->getAlignment(), AI, 0}; + SVD.push_back(D); + } + // Minimal header size (left redzone) is 4 pointers, + // i.e. 32 bytes on 64-bit platforms and 16 bytes in 32-bit platforms. + size_t MinHeaderSize = ASan.LongSize / 2; + ASanStackFrameLayout L; + ComputeASanStackFrameLayout(SVD, 1UL << Mapping.Scale, MinHeaderSize, &L); + DEBUG(dbgs() << L.DescriptionString << " --- " << L.FrameSize << "\n"); + uint64_t LocalStackSize = L.FrameSize; + bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel && + LocalStackSize <= kMaxStackMallocSize; + // Don't do dynamic alloca or stack malloc in presence of inline asm: + // too often it makes assumptions on which registers are available. + bool DoDynamicAlloca = ClDynamicAllocaStack && !HasNonEmptyInlineAsm; + DoStackMalloc &= !HasNonEmptyInlineAsm; + + Value *StaticAlloca = + DoDynamicAlloca ? 
nullptr : createAllocaForLayout(IRB, L, false); + + Value *FakeStack; + Value *LocalStackBase; + + if (DoStackMalloc) { + // void *FakeStack = __asan_option_detect_stack_use_after_return + // ? __asan_stack_malloc_N(LocalStackSize) + // : nullptr; + // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize); + Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal( + kAsanOptionDetectUAR, IRB.getInt32Ty()); + Value *UARIsEnabled = + IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR), + Constant::getNullValue(IRB.getInt32Ty())); + Instruction *Term = + SplitBlockAndInsertIfThen(UARIsEnabled, InsBefore, false); + IRBuilder<> IRBIf(Term); + IRBIf.SetCurrentDebugLocation(EntryDebugLocation); + StackMallocIdx = StackMallocSizeClass(LocalStackSize); + assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); + Value *FakeStackValue = + IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], + ConstantInt::get(IntptrTy, LocalStackSize)); + IRB.SetInsertPoint(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); + FakeStack = createPHI(IRB, UARIsEnabled, FakeStackValue, Term, + ConstantInt::get(IntptrTy, 0)); + + Value *NoFakeStack = + IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); + Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); + IRBIf.SetInsertPoint(Term); + IRBIf.SetCurrentDebugLocation(EntryDebugLocation); + Value *AllocaValue = + DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; + IRB.SetInsertPoint(InsBefore); + IRB.SetCurrentDebugLocation(EntryDebugLocation); + LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); + } else { + // void *FakeStack = nullptr; + // void *LocalStackBase = alloca(LocalStackSize); + FakeStack = ConstantInt::get(IntptrTy, 0); + LocalStackBase = + DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca; + } + + // Insert poison calls for lifetime intrinsics for alloca. + bool HavePoisonedAllocas = false; + for (const auto &APC : AllocaPoisonCallVec) { + assert(APC.InsBefore); + assert(APC.AI); + IRBuilder<> IRB(APC.InsBefore); + poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison); + HavePoisonedAllocas |= APC.DoPoison; + } + + // Replace Alloca instructions with base+offset. + for (const auto &Desc : SVD) { + AllocaInst *AI = Desc.AI; + Value *NewAllocaPtr = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)), + AI->getType()); + replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, /*Deref=*/true); + AI->replaceAllUsesWith(NewAllocaPtr); + } + + // The left-most redzone has enough space for at least 4 pointers. + // Write the Magic value to redzone[0]. + Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy); + IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), + BasePlus0); + // Write the frame description constant to redzone[1]. + Value *BasePlus1 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, ASan.LongSize / 8)), + IntptrPtrTy); + GlobalVariable *StackDescriptionGlobal = + createPrivateGlobalForString(*F.getParent(), L.DescriptionString, + /*AllowMerging*/ true); + Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); + IRB.CreateStore(Description, BasePlus1); + // Write the PC to redzone[2]. 
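+  // Together with the two stores above, this fills the first three header
+  // words: frame magic, frame description string, and the function's address.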
+ Value *BasePlus2 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)), + IntptrPtrTy); + IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2); + + // Poison the stack redzones at the entry. + Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); + poisonRedZones(L.ShadowBytes, IRB, ShadowBase, true); + + // (Un)poison the stack before all ret instructions. + for (auto Ret : RetVec) { + IRBuilder<> IRBRet(Ret); + // Mark the current frame as retired. + IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), + BasePlus0); + if (DoStackMalloc) { + assert(StackMallocIdx >= 0); + // if FakeStack != 0 // LocalStackBase == FakeStack + // // In use-after-return mode, poison the whole stack frame. + // if StackMallocIdx <= 4 + // // For small sizes inline the whole thing: + // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize); + // **SavedFlagPtr(FakeStack) = 0 + // else + // __asan_stack_free_N(FakeStack, LocalStackSize) + // else + // <This is not a fake stack; unpoison the redzones> + Value *Cmp = + IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy)); + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm); + + IRBuilder<> IRBPoison(ThenTerm); + if (StackMallocIdx <= 4) { + int ClassSize = kMinStackMallocSize << StackMallocIdx; + SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase, + ClassSize >> Mapping.Scale); + Value *SavedFlagPtrPtr = IRBPoison.CreateAdd( + FakeStack, + ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8)); + Value *SavedFlagPtr = IRBPoison.CreateLoad( + IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy)); + IRBPoison.CreateStore( + Constant::getNullValue(IRBPoison.getInt8Ty()), + IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy())); + } else { + // For larger frames call __asan_stack_free_*. + IRBPoison.CreateCall( + AsanStackFreeFunc[StackMallocIdx], + {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)}); + } + + IRBuilder<> IRBElse(ElseTerm); + poisonRedZones(L.ShadowBytes, IRBElse, ShadowBase, false); + } else if (HavePoisonedAllocas) { + // If we poisoned some allocas in llvm.lifetime analysis, + // unpoison whole stack frame now. + poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false); + } else { + poisonRedZones(L.ShadowBytes, IRBRet, ShadowBase, false); + } + } + + // We are done. Remove the old unused alloca instructions. + for (auto AI : AllocaVec) AI->eraseFromParent(); +} + +void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size, + IRBuilder<> &IRB, bool DoPoison) { + // For now just insert the call to ASan runtime. + Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy); + Value *SizeArg = ConstantInt::get(IntptrTy, Size); + IRB.CreateCall( + DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc, + {AddrArg, SizeArg}); +} + +// Handling llvm.lifetime intrinsics for a given %alloca: +// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca. +// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect +// invalid accesses) and unpoison it for llvm.lifetime.start (the memory +// could be poisoned by previous llvm.lifetime.end instruction, as the +// variable may go in and out of scope several times, e.g. in loops). +// (3) if we poisoned at least one %alloca in a function, +// unpoison the whole stack frame at function exit. 
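+//
+// An illustrative IR sketch of the pattern described above (names are
+// hypothetical):
+//   %buf = alloca [64 x i8]
+//   %p = bitcast [64 x i8]* %buf to i8*
+//   call void @llvm.lifetime.start(i64 64, i8* %p)  ; %buf is live: unpoison
+//   ...
+//   call void @llvm.lifetime.end(i64 64, i8* %p)    ; %buf is dead: poison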
+
+AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
+    // We're interested only in allocas we can handle.
+    return ASan.isInterestingAlloca(*AI) ? AI : nullptr;
+  // See if we've already calculated (or started to calculate) alloca for a
+  // given value.
+  AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
+  if (I != AllocaForValue.end()) return I->second;
+  // Store 0 while we're calculating alloca for value V to avoid
+  // infinite recursion if the value references itself.
+  AllocaForValue[V] = nullptr;
+  AllocaInst *Res = nullptr;
+  if (CastInst *CI = dyn_cast<CastInst>(V))
+    Res = findAllocaForValue(CI->getOperand(0));
+  else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    for (Value *IncValue : PN->incoming_values()) {
+      // Allow self-referencing phi-nodes.
+      if (IncValue == PN) continue;
+      AllocaInst *IncValueAI = findAllocaForValue(IncValue);
+      // AI for incoming values should exist and should all be equal.
+      if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res))
+        return nullptr;
+      Res = IncValueAI;
+    }
+  }
+  if (Res) AllocaForValue[V] = Res;
+  return Res;
+}
+
+void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
+  IRBuilder<> IRB(AI);
+
+  const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
+  const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
+
+  Value *Zero = Constant::getNullValue(IntptrTy);
+  Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
+  Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
+
+  // We need to extend the alloca with additional memory for the redzones.
+  // OldSize is the allocated size in bytes: the number of allocated elements
+  // times ElementSize, the size of one element.
+  const unsigned ElementSize =
+      F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
+  Value *OldSize =
+      IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
+                    ConstantInt::get(IntptrTy, ElementSize));
+
+  // PartialSize = OldSize % 32
+  Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
+
+  // Misalign = kAllocaRzSize - PartialSize;
+  Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
+
+  // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
+  // E.g. (illustrative numbers): OldSize == 100 gives PartialSize == 4,
+  // Misalign == 28, and hence PartialPadding == 28.
+  Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
+  Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
+
+  // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize
+  // Align is added to locate the left redzone, PartialPadding for a possible
+  // partial redzone, and kAllocaRzSize for the right redzone, respectively.
+  Value *AdditionalChunkSize = IRB.CreateAdd(
+      ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding);
+
+  Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
+
+  // Insert a new alloca with the new NewSize and Align params.
+  AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
+  NewAlloca->setAlignment(Align);
+
+  // NewAddress = Address + Align
+  Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+                                    ConstantInt::get(IntptrTy, Align));
+
+  // Insert an __asan_alloca_poison call for the newly created alloca.
+  IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
+
+  // Store the last alloca's address to DynamicAllocaLayout. We'll need it
+  // to unpoison the dynamic allocas on function exit.
+  IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+                  DynamicAllocaLayout);
+
+  Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+
+  // Replace all uses of the address returned by the old alloca with
+  // NewAddressPtr.
+  AI->replaceAllUsesWith(NewAddressPtr);
+
+  // We are done. Erase the old alloca from its parent.
+  AI->eraseFromParent();
+}
+
+// isSafeAccess returns true if Addr is always inbounds with respect to its
+// base object. For example, it is a field access or an array access with
+// constant inbounds index.
+bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
+                                    Value *Addr, uint64_t TypeSize) const {
+  SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
+  if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
+  uint64_t Size = SizeOffset.first.getZExtValue();
+  int64_t Offset = SizeOffset.second.getSExtValue();
+  // Three checks are required to ensure safety:
+  // . Offset >= 0  (since the offset is given from the base ptr)
+  // . Size >= Offset  (unsigned)
+  // . Size - Offset >= NeededSize  (unsigned)
+  return Offset >= 0 && Size >= uint64_t(Offset) &&
+         Size - uint64_t(Offset) >= TypeSize / 8;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
new file mode 100644
index 0000000..f685803
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -0,0 +1,213 @@
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that instruments the code to perform run-time
+// bounds checking on loads, stores, and other memory intrinsics.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "bounds-checking" + +static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", + cl::desc("Use one trap block per function")); + +STATISTIC(ChecksAdded, "Bounds checks added"); +STATISTIC(ChecksSkipped, "Bounds checks skipped"); +STATISTIC(ChecksUnable, "Bounds checks unable to add"); + +typedef IRBuilder<true, TargetFolder> BuilderTy; + +namespace { + struct BoundsChecking : public FunctionPass { + static char ID; + + BoundsChecking() : FunctionPass(ID) { + initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + } + + private: + const TargetLibraryInfo *TLI; + ObjectSizeOffsetEvaluator *ObjSizeEval; + BuilderTy *Builder; + Instruction *Inst; + BasicBlock *TrapBB; + + BasicBlock *getTrapBB(); + void emitBranchToTrap(Value *Cmp = nullptr); + bool instrument(Value *Ptr, Value *Val, const DataLayout &DL); + }; +} + +char BoundsChecking::ID = 0; +INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking", + false, false) + + +/// getTrapBB - create a basic block that traps. All overflowing conditions +/// branch to this block. There's only one trap block per function. +BasicBlock *BoundsChecking::getTrapBB() { + if (TrapBB && SingleTrapBB) + return TrapBB; + + Function *Fn = Inst->getParent()->getParent(); + IRBuilder<>::InsertPointGuard Guard(*Builder); + TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + Builder->SetInsertPoint(TrapBB); + + llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); + CallInst *TrapCall = Builder->CreateCall(F, {}); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(Inst->getDebugLoc()); + Builder->CreateUnreachable(); + + return TrapBB; +} + + +/// emitBranchToTrap - emit a branch instruction to a trap block. +/// If Cmp is non-null, perform a jump only if its value evaluates to true. +void BoundsChecking::emitBranchToTrap(Value *Cmp) { + // check if the comparison is always false + ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp); + if (C) { + ++ChecksSkipped; + if (!C->getZExtValue()) + return; + else + Cmp = nullptr; // unconditional branch + } + ++ChecksAdded; + + Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock *OldBB = Inst->getParent(); + BasicBlock *Cont = OldBB->splitBasicBlock(Inst); + OldBB->getTerminator()->eraseFromParent(); + + if (Cmp) + BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB); + else + BranchInst::Create(getTrapBB(), OldBB); +} + + +/// instrument - adds run-time bounds checks to memory accessing instructions. +/// Ptr is the pointer that will be read/written, and InstVal is either the +/// result from the load or the value being stored. It is used to determine the +/// size of memory block that is touched. 
+/// Returns true if any change was made to the IR, false otherwise.
+bool BoundsChecking::instrument(Value *Ptr, Value *InstVal,
+                                const DataLayout &DL) {
+  uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
+  DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+               << " bytes\n");
+
+  SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr);
+
+  if (!ObjSizeEval->bothKnown(SizeOffset)) {
+    ++ChecksUnable;
+    return false;
+  }
+
+  Value *Size = SizeOffset.first;
+  Value *Offset = SizeOffset.second;
+  ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+  Type *IntTy = DL.getIntPtrType(Ptr->getType());
+  Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+  // three checks are required to ensure safety:
+  // . Offset >= 0  (since the offset is given from the base ptr)
+  // . Size >= Offset  (unsigned)
+  // . Size - Offset >= NeededSize  (unsigned)
+  //
+  // optimization: if Size >= 0 (signed), skip 1st check
+  // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
+  Value *ObjSize = Builder->CreateSub(Size, Offset);
+  Value *Cmp2 = Builder->CreateICmpULT(Size, Offset);
+  Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal);
+  Value *Or = Builder->CreateOr(Cmp2, Cmp3);
+  if (!SizeCI || SizeCI->getValue().slt(0)) {
+    Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+    Or = Builder->CreateOr(Cmp1, Or);
+  }
+  emitBranchToTrap(Or);
+
+  return true;
+}
+
+bool BoundsChecking::runOnFunction(Function &F) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  TrapBB = nullptr;
+  BuilderTy TheBuilder(F.getContext(), TargetFolder(DL));
+  Builder = &TheBuilder;
+  ObjectSizeOffsetEvaluator TheObjSizeEval(DL, TLI, F.getContext(),
+                                           /*RoundToAlign=*/true);
+  ObjSizeEval = &TheObjSizeEval;
+
+  // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
+  // touching instructions
+  std::vector<Instruction*> WorkList;
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &*i;
+    if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) ||
+        isa<AtomicRMWInst>(I))
+      WorkList.push_back(I);
+  }
+
+  bool MadeChange = false;
+  for (std::vector<Instruction*>::iterator i = WorkList.begin(),
+       e = WorkList.end(); i != e; ++i) {
+    Inst = *i;
+
+    Builder->SetInsertPoint(Inst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      MadeChange |= instrument(LI->getPointerOperand(), LI, DL);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      MadeChange |=
+          instrument(SI->getPointerOperand(), SI->getValueOperand(), DL);
+    } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+      MadeChange |=
+          instrument(AI->getPointerOperand(), AI->getCompareOperand(), DL);
+    } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
+      MadeChange |=
+          instrument(AI->getPointerOperand(), AI->getValOperand(), DL);
+    } else {
+      llvm_unreachable("unknown Instruction type");
+    }
+  }
+  return MadeChange;
+}
+
+FunctionPass *llvm::createBoundsCheckingPass() {
+  return new BoundsChecking();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 0000000..2de6e1a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1623 @@
+//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
+//
+// The LLVM Compiler Infrastructure
+// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow +/// analysis. +/// +/// Unlike other Sanitizer tools, this tool is not designed to detect a specific +/// class of bugs on its own. Instead, it provides a generic dynamic data flow +/// analysis framework to be used by clients to help detect application-specific +/// issues within their own code. +/// +/// The analysis is based on automatic propagation of data flow labels (also +/// known as taint labels) through a program as it performs computation. Each +/// byte of application memory is backed by two bytes of shadow memory which +/// hold the label. On Linux/x86_64, memory is laid out as follows: +/// +/// +--------------------+ 0x800000000000 (top of memory) +/// | application memory | +/// +--------------------+ 0x700000008000 (kAppAddr) +/// | | +/// | unused | +/// | | +/// +--------------------+ 0x200200000000 (kUnusedAddr) +/// | union table | +/// +--------------------+ 0x200000000000 (kUnionTableAddr) +/// | shadow memory | +/// +--------------------+ 0x000000010000 (kShadowAddr) +/// | reserved by kernel | +/// +--------------------+ 0x000000000000 +/// +/// To derive a shadow memory address from an application memory address, +/// bits 44-46 are cleared to bring the address into the range +/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to +/// account for the double byte representation of shadow labels and move the +/// address into the shadow memory range. See the function +/// DataFlowSanitizer::getShadowAddress below. +/// +/// For more information, please refer to the design document: +/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/SpecialCaseList.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <iterator> +#include <set> +#include <utility> + +using namespace llvm; + +// The -dfsan-preserve-alignment flag controls whether this pass assumes that +// alignment requirements provided by the input IR are correct. For example, +// if the input IR contains a load with alignment 8, this flag will cause +// the shadow load to have alignment 16. This flag is disabled by default as +// we have unfortunately encountered too much code (including Clang itself; +// see PR14291) which performs misaligned access. +static cl::opt<bool> ClPreserveAlignment( + "dfsan-preserve-alignment", + cl::desc("respect alignment requirements provided by input IR"), cl::Hidden, + cl::init(false)); + +// The ABI list files control how shadow parameters are passed. 
The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function
+// is unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::list<std::string> ClABIListFiles(
+    "dfsan-abilist",
+    cl::desc("File listing native ABI functions and how the pass treats them"),
+    cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+    "dfsan-args-abi",
+    cl::desc("Use the argument ABI rather than the TLS ABI"),
+    cl::Hidden);
+
+// Controls whether the pass includes or ignores the labels of pointers in load
+// instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnLoad(
+    "dfsan-combine-pointer-labels-on-load",
+    cl::desc("Combine the label of the pointer with the label of the data when "
+             "loading from memory."),
+    cl::Hidden, cl::init(true));
+
+// Controls whether the pass includes or ignores the labels of pointers in
+// store instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnStore(
+    "dfsan-combine-pointer-labels-on-store",
+    cl::desc("Combine the label of the pointer with the label of the data when "
+             "storing in memory."),
+    cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+    "dfsan-debug-nonzero-labels",
+    cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+             "load or return with a nonzero label"),
+    cl::Hidden);
+
+namespace {
+
+StringRef GetGlobalTypeString(const GlobalValue &G) {
+  // Types of GlobalVariables are always pointer types.
+  Type *GType = G.getType()->getElementType();
+  // For now we support blacklisting struct types only.
+  if (StructType *SGType = dyn_cast<StructType>(GType)) {
+    if (!SGType->isLiteral())
+      return SGType->getName();
+  }
+  return "<unknown type>";
+}
+
+class DFSanABIList {
+  std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+  DFSanABIList() {}
+
+  void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
+
+  /// Returns whether either this function or its source file are listed in the
+  /// given category.
+  bool isIn(const Function &F, StringRef Category) const {
+    return isIn(*F.getParent(), Category) ||
+           SCL->inSection("fun", F.getName(), Category);
+  }
+
+  /// Returns whether this global alias is listed in the given category.
+  ///
+  /// If GA aliases a function, the alias's name is matched as a function name
+  /// would be. Similarly, aliases of globals are matched like globals.
+  bool isIn(const GlobalAlias &GA, StringRef Category) const {
+    if (isIn(*GA.getParent(), Category))
+      return true;
+
+    if (isa<FunctionType>(GA.getType()->getElementType()))
+      return SCL->inSection("fun", GA.getName(), Category);
+
+    return SCL->inSection("global", GA.getName(), Category) ||
+           SCL->inSection("type", GetGlobalTypeString(GA), Category);
+  }
+
+  /// Returns whether this module is listed in the given category.
+ bool isIn(const Module &M, StringRef Category) const { + return SCL->inSection("src", M.getModuleIdentifier(), Category); + } +}; + +class DataFlowSanitizer : public ModulePass { + friend struct DFSanFunction; + friend class DFSanVisitor; + + enum { + ShadowWidth = 16 + }; + + /// Which ABI should be used for instrumented functions? + enum InstrumentedABI { + /// Argument and return value labels are passed through additional + /// arguments and by modifying the return type. + IA_Args, + + /// Argument and return value labels are passed through TLS variables + /// __dfsan_arg_tls and __dfsan_retval_tls. + IA_TLS + }; + + /// How should calls to uninstrumented functions be handled? + enum WrapperKind { + /// This function is present in an uninstrumented form but we don't know + /// how it should be handled. Print a warning and call the function anyway. + /// Don't label the return value. + WK_Warning, + + /// This function does not write to (user-accessible) memory, and its return + /// value is unlabelled. + WK_Discard, + + /// This function does not write to (user-accessible) memory, and the label + /// of its return value is the union of the label of its arguments. + WK_Functional, + + /// Instead of calling the function, a custom wrapper __dfsw_F is called, + /// where F is the name of the function. This function may wrap the + /// original function or provide its own implementation. This is similar to + /// the IA_Args ABI, except that IA_Args uses a struct return type to + /// pass the return value shadow in a register, while WK_Custom uses an + /// extra pointer argument to return the shadow. This allows the wrapped + /// form of the function type to be expressed in C. + WK_Custom + }; + + Module *Mod; + LLVMContext *Ctx; + IntegerType *ShadowTy; + PointerType *ShadowPtrTy; + IntegerType *IntptrTy; + ConstantInt *ZeroShadow; + ConstantInt *ShadowPtrMask; + ConstantInt *ShadowPtrMul; + Constant *ArgTLS; + Constant *RetvalTLS; + void *(*GetArgTLSPtr)(); + void *(*GetRetvalTLSPtr)(); + Constant *GetArgTLS; + Constant *GetRetvalTLS; + FunctionType *DFSanUnionFnTy; + FunctionType *DFSanUnionLoadFnTy; + FunctionType *DFSanUnimplementedFnTy; + FunctionType *DFSanSetLabelFnTy; + FunctionType *DFSanNonzeroLabelFnTy; + FunctionType *DFSanVarargWrapperFnTy; + Constant *DFSanUnionFn; + Constant *DFSanCheckedUnionFn; + Constant *DFSanUnionLoadFn; + Constant *DFSanUnimplementedFn; + Constant *DFSanSetLabelFn; + Constant *DFSanNonzeroLabelFn; + Constant *DFSanVarargWrapperFn; + MDNode *ColdCallWeights; + DFSanABIList ABIList; + DenseMap<Value *, Function *> UnwrappedFnMap; + AttributeSet ReadOnlyNoneAttrs; + DenseMap<const Function *, DISubprogram *> FunctionDIs; + + Value *getShadowAddress(Value *Addr, Instruction *Pos); + bool isInstrumented(const Function *F); + bool isInstrumented(const GlobalAlias *GA); + FunctionType *getArgsFunctionType(FunctionType *T); + FunctionType *getTrampolineFunctionType(FunctionType *T); + FunctionType *getCustomFunctionType(FunctionType *T); + InstrumentedABI getInstrumentedABI(); + WrapperKind getWrapperKind(Function *F); + void addGlobalNamePrefix(GlobalValue *GV); + Function *buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT); + Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName); + + public: + DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles = std::vector<std::string>(), + void *(*getArgTLS)() = nullptr, void *(*getRetValTLS)() = nullptr); + static char ID; + 
bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; +}; + +struct DFSanFunction { + DataFlowSanitizer &DFS; + Function *F; + DominatorTree DT; + DataFlowSanitizer::InstrumentedABI IA; + bool IsNativeABI; + Value *ArgTLSPtr; + Value *RetvalTLSPtr; + AllocaInst *LabelReturnAlloca; + DenseMap<Value *, Value *> ValShadowMap; + DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap; + std::vector<std::pair<PHINode *, PHINode *> > PHIFixups; + DenseSet<Instruction *> SkipInsts; + std::vector<Value *> NonZeroChecks; + bool AvoidNewBlocks; + + struct CachedCombinedShadow { + BasicBlock *Block; + Value *Shadow; + }; + DenseMap<std::pair<Value *, Value *>, CachedCombinedShadow> + CachedCombinedShadows; + DenseMap<Value *, std::set<Value *>> ShadowElements; + + DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI) + : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), + IsNativeABI(IsNativeABI), ArgTLSPtr(nullptr), RetvalTLSPtr(nullptr), + LabelReturnAlloca(nullptr) { + DT.recalculate(*F); + // FIXME: Need to track down the register allocator issue which causes poor + // performance in pathological cases with large numbers of basic blocks. + AvoidNewBlocks = F->size() > 1000; + } + Value *getArgTLSPtr(); + Value *getArgTLS(unsigned Index, Instruction *Pos); + Value *getRetvalTLS(); + Value *getShadow(Value *V); + void setShadow(Instruction *I, Value *Shadow); + Value *combineShadows(Value *V1, Value *V2, Instruction *Pos); + Value *combineOperandShadows(Instruction *Inst); + Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align, + Instruction *Pos); + void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow, + Instruction *Pos); +}; + +class DFSanVisitor : public InstVisitor<DFSanVisitor> { + public: + DFSanFunction &DFSF; + DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {} + + void visitOperandShadowInst(Instruction &I); + + void visitBinaryOperator(BinaryOperator &BO); + void visitCastInst(CastInst &CI); + void visitCmpInst(CmpInst &CI); + void visitGetElementPtrInst(GetElementPtrInst &GEPI); + void visitLoadInst(LoadInst &LI); + void visitStoreInst(StoreInst &SI); + void visitReturnInst(ReturnInst &RI); + void visitCallSite(CallSite CS); + void visitPHINode(PHINode &PN); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &I); + void visitInsertValueInst(InsertValueInst &I); + void visitAllocaInst(AllocaInst &I); + void visitSelectInst(SelectInst &I); + void visitMemSetInst(MemSetInst &I); + void visitMemTransferInst(MemTransferInst &I); +}; + +} + +char DataFlowSanitizer::ID; +INITIALIZE_PASS(DataFlowSanitizer, "dfsan", + "DataFlowSanitizer: dynamic data flow analysis.", false, false) + +ModulePass * +llvm::createDataFlowSanitizerPass(const std::vector<std::string> &ABIListFiles, + void *(*getArgTLS)(), + void *(*getRetValTLS)()) { + return new DataFlowSanitizer(ABIListFiles, getArgTLS, getRetValTLS); +} + +DataFlowSanitizer::DataFlowSanitizer( + const std::vector<std::string> &ABIListFiles, void *(*getArgTLS)(), + void *(*getRetValTLS)()) + : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS) { + std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); + AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), + ClABIListFiles.end()); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); +} + +FunctionType 
*DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { + llvm::SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); + return FunctionType::get(RetType, ArgTypes, T->isVarArg()); +} + +FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) { + assert(!T->isVarArg()); + llvm::SmallVector<Type *, 4> ArgTypes; + ArgTypes.push_back(T->getPointerTo()); + ArgTypes.append(T->param_begin(), T->param_end()); + ArgTypes.append(T->getNumParams(), ShadowTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, false); +} + +FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) { + llvm::SmallVector<Type *, 4> ArgTypes; + for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end(); + i != e; ++i) { + FunctionType *FT; + if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>( + *i)->getElementType()))) { + ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo()); + ArgTypes.push_back(Type::getInt8PtrTy(*Ctx)); + } else { + ArgTypes.push_back(*i); + } + } + for (unsigned i = 0, e = T->getNumParams(); i != e; ++i) + ArgTypes.push_back(ShadowTy); + if (T->isVarArg()) + ArgTypes.push_back(ShadowPtrTy); + Type *RetType = T->getReturnType(); + if (!RetType->isVoidTy()) + ArgTypes.push_back(ShadowPtrTy); + return FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()); +} + +bool DataFlowSanitizer::doInitialization(Module &M) { + llvm::Triple TargetTriple(M.getTargetTriple()); + bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 || + TargetTriple.getArch() == llvm::Triple::mips64el; + + const DataLayout &DL = M.getDataLayout(); + + Mod = &M; + Ctx = &M.getContext(); + ShadowTy = IntegerType::get(*Ctx, ShadowWidth); + ShadowPtrTy = PointerType::getUnqual(ShadowTy); + IntptrTy = DL.getIntPtrType(*Ctx); + ZeroShadow = ConstantInt::getSigned(ShadowTy, 0); + ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8); + if (IsX86_64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + else if (IsMIPS64) + ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + else + report_fatal_error("unsupported triple"); + + Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy }; + DFSanUnionFnTy = + FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false); + Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy }; + DFSanUnionLoadFnTy = + FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false); + DFSanUnimplementedFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy }; + DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx), + DFSanSetLabelArgs, /*isVarArg=*/false); + DFSanNonzeroLabelFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), None, /*isVarArg=*/false); + DFSanVarargWrapperFnTy = FunctionType::get( + Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false); + + if (GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = nullptr; + GetArgTLS = ConstantExpr::getIntToPtr( + 
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ArgTLSTy), + (Type *)nullptr))); + } + if (GetRetvalTLSPtr) { + RetvalTLS = nullptr; + GetRetvalTLS = ConstantExpr::getIntToPtr( + ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), + PointerType::getUnqual( + FunctionType::get(PointerType::getUnqual(ShadowTy), + (Type *)nullptr))); + } + + ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); + return true; +} + +bool DataFlowSanitizer::isInstrumented(const Function *F) { + return !ABIList.isIn(*F, "uninstrumented"); +} + +bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) { + return !ABIList.isIn(*GA, "uninstrumented"); +} + +DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() { + return ClArgsABI ? IA_Args : IA_TLS; +} + +DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) { + if (ABIList.isIn(*F, "functional")) + return WK_Functional; + if (ABIList.isIn(*F, "discard")) + return WK_Discard; + if (ABIList.isIn(*F, "custom")) + return WK_Custom; + + return WK_Warning; +} + +void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) { + std::string GVName = GV->getName(), Prefix = "dfs$"; + GV->setName(Prefix + GVName); + + // Try to change the name of the function in module inline asm. We only do + // this for specific asm directives, currently only ".symver", to try to avoid + // corrupting asm which happens to contain the symbol name as a substring. + // Note that the substitution for .symver assumes that the versioned symbol + // also has an instrumented name. + std::string Asm = GV->getParent()->getModuleInlineAsm(); + std::string SearchStr = ".symver " + GVName + ","; + size_t Pos = Asm.find(SearchStr); + if (Pos != std::string::npos) { + Asm.replace(Pos, SearchStr.size(), + ".symver " + Prefix + GVName + "," + Prefix); + GV->getParent()->setModuleInlineAsm(Asm); + } +} + +Function * +DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, + GlobalValue::LinkageTypes NewFLink, + FunctionType *NewFT) { + FunctionType *FT = F->getFunctionType(); + Function *NewF = Function::Create(NewFT, NewFLink, NewFName, + F->getParent()); + NewF->copyAttributesFrom(F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeSet::get(F->getContext(), AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType()))); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); + if (F->isVarArg()) { + NewF->removeAttributes( + AttributeSet::FunctionIndex, + AttributeSet().addAttribute(*Ctx, AttributeSet::FunctionIndex, + "split-stack")); + CallInst::Create(DFSanVarargWrapperFn, + IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", + BB); + new UnreachableInst(*Ctx, BB); + } else { + std::vector<Value *> Args; + unsigned n = FT->getNumParams(); + for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n) + Args.push_back(&*ai); + CallInst *CI = CallInst::Create(F, Args, "", BB); + if (FT->getReturnType()->isVoidTy()) + ReturnInst::Create(*Ctx, BB); + else + ReturnInst::Create(*Ctx, CI, BB); + } + + return NewF; +} + +Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT, + StringRef FName) { + FunctionType *FTT = getTrampolineFunctionType(FT); + Constant *C = Mod->getOrInsertFunction(FName, FTT); + Function *F = dyn_cast<Function>(C); + if (F && F->isDeclaration()) { + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", 
F); + std::vector<Value *> Args; + Function::arg_iterator AI = F->arg_begin(); ++AI; + for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N) + Args.push_back(&*AI); + CallInst *CI = + CallInst::Create(&F->getArgumentList().front(), Args, "", BB); + ReturnInst *RI; + if (FT->getReturnType()->isVoidTy()) + RI = ReturnInst::Create(*Ctx, BB); + else + RI = ReturnInst::Create(*Ctx, CI, BB); + + DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true); + Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI; + for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) + DFSF.ValShadowMap[ValAI] = ShadowAI; + DFSanVisitor(DFSF).visitCallInst(*CI); + if (!FT->getReturnType()->isVoidTy()) + new StoreInst(DFSF.getShadow(RI->getReturnValue()), + &F->getArgumentList().back(), RI); + } + + return C; +} + +bool DataFlowSanitizer::runOnModule(Module &M) { + if (ABIList.isIn(M, "skip")) + return false; + + FunctionDIs = makeSubprogramMap(M); + + if (!GetArgTLSPtr) { + Type *ArgTLSTy = ArrayType::get(ShadowTy, 64); + ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + if (!GetRetvalTLSPtr) { + RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy); + if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) + G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel); + } + + DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + F->addAttribute(1, Attribute::ZExt); + F->addAttribute(2, Attribute::ZExt); + } + DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy); + if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + F->addAttribute(1, Attribute::ZExt); + F->addAttribute(2, Attribute::ZExt); + } + DFSanUnionLoadFn = + Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy); + if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) { + F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); + F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly); + F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + } + DFSanUnimplementedFn = + Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy); + DFSanSetLabelFn = + Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy); + if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) { + F->addAttribute(1, Attribute::ZExt); + } + DFSanNonzeroLabelFn = + Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy); + DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper", + DFSanVarargWrapperFnTy); + + std::vector<Function *> FnsToInstrument; + llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI; + for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) { + if (!i->isIntrinsic() && + i != DFSanUnionFn && + i != DFSanCheckedUnionFn && + i != DFSanUnionLoadFn && + i != DFSanUnimplementedFn && + i != DFSanSetLabelFn && + i != DFSanNonzeroLabelFn && + i != DFSanVarargWrapperFn) + 
FnsToInstrument.push_back(&*i); + } + + // Give function aliases prefixes when necessary, and build wrappers where the + // instrumentedness is inconsistent. + for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) { + GlobalAlias *GA = &*i; + ++i; + // Don't stop on weak. We assume people aren't playing games with the + // instrumentedness of overridden weak aliases. + if (auto F = dyn_cast<Function>(GA->getBaseObject())) { + bool GAInst = isInstrumented(GA), FInst = isInstrumented(F); + if (GAInst && FInst) { + addGlobalNamePrefix(GA); + } else if (GAInst != FInst) { + // Non-instrumented alias of an instrumented function, or vice versa. + // Replace the alias with a native-ABI wrapper of the aliasee. The pass + // below will take care of instrumenting it. + Function *NewF = + buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType()); + GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType())); + NewF->takeName(GA); + GA->eraseFromParent(); + FnsToInstrument.push_back(NewF); + } + } + } + + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); + ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B); + + // First, change the ABI of every function in the module. ABI-listed + // functions keep their original ABI and get a wrapper function. + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + Function &F = **i; + FunctionType *FT = F.getFunctionType(); + + bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() && + FT->getReturnType()->isVoidTy()); + + if (isInstrumented(&F)) { + // Instrumented functions get a 'dfs$' prefix. This allows us to more + // easily identify cases of mismatching ABIs. + if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) { + FunctionType *NewFT = getArgsFunctionType(FT); + Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M); + NewF->copyAttributesFrom(&F); + NewF->removeAttributes( + AttributeSet::ReturnIndex, + AttributeSet::get(NewF->getContext(), AttributeSet::ReturnIndex, + AttributeFuncs::typeIncompatible(NewFT->getReturnType()))); + for (Function::arg_iterator FArg = F.arg_begin(), + NewFArg = NewF->arg_begin(), + FArgEnd = F.arg_end(); + FArg != FArgEnd; ++FArg, ++NewFArg) { + FArg->replaceAllUsesWith(NewFArg); + } + NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList()); + + for (Function::user_iterator UI = F.user_begin(), UE = F.user_end(); + UI != UE;) { + BlockAddress *BA = dyn_cast<BlockAddress>(*UI); + ++UI; + if (BA) { + BA->replaceAllUsesWith( + BlockAddress::get(NewF, BA->getBasicBlock())); + delete BA; + } + } + F.replaceAllUsesWith( + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT))); + NewF->takeName(&F); + F.eraseFromParent(); + *i = NewF; + addGlobalNamePrefix(NewF); + } else { + addGlobalNamePrefix(&F); + } + } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) { + // Build a wrapper function for F. The wrapper simply calls F, and is + // added to FnsToInstrument so that any instrumentation according to its + // WrapperKind is done in the second pass below. + FunctionType *NewFT = getInstrumentedABI() == IA_Args + ? 
getArgsFunctionType(FT) + : FT; + Function *NewF = buildWrapperFunction( + &F, std::string("dfsw$") + std::string(F.getName()), + GlobalValue::LinkOnceODRLinkage, NewFT); + if (getInstrumentedABI() == IA_TLS) + NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs); + + Value *WrappedFnCst = + ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)); + F.replaceAllUsesWith(WrappedFnCst); + + // Patch the pointer to LLVM function in debug info descriptor. + auto DI = FunctionDIs.find(&F); + if (DI != FunctionDIs.end()) + DI->second->replaceFunction(&F); + + UnwrappedFnMap[WrappedFnCst] = &F; + *i = NewF; + + if (!F.isDeclaration()) { + // This function is probably defining an interposition of an + // uninstrumented function and hence needs to keep the original ABI. + // But any functions it may call need to use the instrumented ABI, so + // we instrument it in a mode which preserves the original ABI. + FnsWithNativeABI.insert(&F); + + // This code needs to rebuild the iterators, as they may be invalidated + // by the push_back, taking care that the new range does not include + // any functions added by this code. + size_t N = i - FnsToInstrument.begin(), + Count = e - FnsToInstrument.begin(); + FnsToInstrument.push_back(&F); + i = FnsToInstrument.begin() + N; + e = FnsToInstrument.begin() + Count; + } + // Hopefully, nobody will try to indirectly call a vararg + // function... yet. + } else if (FT->isVarArg()) { + UnwrappedFnMap[&F] = &F; + *i = nullptr; + } + } + + for (std::vector<Function *>::iterator i = FnsToInstrument.begin(), + e = FnsToInstrument.end(); + i != e; ++i) { + if (!*i || (*i)->isDeclaration()) + continue; + + removeUnreachableBlocks(**i); + + DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i)); + + // DFSanVisitor may create new basic blocks, which confuses df_iterator. + // Build a copy of the list before iterating over it. + llvm::SmallVector<BasicBlock *, 4> BBList( + depth_first(&(*i)->getEntryBlock())); + + for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(), + e = BBList.end(); + i != e; ++i) { + Instruction *Inst = &(*i)->front(); + while (1) { + // DFSanVisitor may split the current basic block, changing the current + // instruction's next pointer and moving the next instruction to the + // tail block from which we should continue. + Instruction *Next = Inst->getNextNode(); + // DFSanVisitor may delete Inst, so keep track of whether it was a + // terminator. + bool IsTerminator = isa<TerminatorInst>(Inst); + if (!DFSF.SkipInsts.count(Inst)) + DFSanVisitor(DFSF).visit(Inst); + if (IsTerminator) + break; + Inst = Next; + } + } + + // We will not necessarily be able to compute the shadow for every phi node + // until we have visited every block. Therefore, the code that handles phi + // nodes adds them to the PHIFixups list so that they can be properly + // handled here. + for (std::vector<std::pair<PHINode *, PHINode *> >::iterator + i = DFSF.PHIFixups.begin(), + e = DFSF.PHIFixups.end(); + i != e; ++i) { + for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n; + ++val) { + i->second->setIncomingValue( + val, DFSF.getShadow(i->first->getIncomingValue(val))); + } + } + + // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy + // places (i.e. instructions in basic blocks we haven't even begun visiting + // yet). To make our life easier, do this work in a pass after the main + // instrumentation. 
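+    // Concretely: for every value recorded in NonZeroChecks, emit
+    //   if (shadow != 0) call __dfsan_nonzero_label();
+    // right after the value's definition (see below).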
+ if (ClDebugNonzeroLabels) { + for (Value *V : DFSF.NonZeroChecks) { + Instruction *Pos; + if (Instruction *I = dyn_cast<Instruction>(V)) + Pos = I->getNextNode(); + else + Pos = DFSF.F->getEntryBlock().begin(); + while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos)) + Pos = Pos->getNextNode(); + IRBuilder<> IRB(Pos); + Value *Ne = IRB.CreateICmpNE(V, DFSF.DFS.ZeroShadow); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + Ne, Pos, /*Unreachable=*/false, ColdCallWeights)); + IRBuilder<> ThenIRB(BI); + ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {}); + } + } + } + + return false; +} + +Value *DFSanFunction::getArgTLSPtr() { + if (ArgTLSPtr) + return ArgTLSPtr; + if (DFS.ArgTLS) + return ArgTLSPtr = DFS.ArgTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS, {}); +} + +Value *DFSanFunction::getRetvalTLS() { + if (RetvalTLSPtr) + return RetvalTLSPtr; + if (DFS.RetvalTLS) + return RetvalTLSPtr = DFS.RetvalTLS; + + IRBuilder<> IRB(F->getEntryBlock().begin()); + return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS, {}); +} + +Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) { + IRBuilder<> IRB(Pos); + return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx); +} + +Value *DFSanFunction::getShadow(Value *V) { + if (!isa<Argument>(V) && !isa<Instruction>(V)) + return DFS.ZeroShadow; + Value *&Shadow = ValShadowMap[V]; + if (!Shadow) { + if (Argument *A = dyn_cast<Argument>(V)) { + if (IsNativeABI) + return DFS.ZeroShadow; + switch (IA) { + case DataFlowSanitizer::IA_TLS: { + Value *ArgTLSPtr = getArgTLSPtr(); + Instruction *ArgTLSPos = + DFS.ArgTLS ? &*F->getEntryBlock().begin() + : cast<Instruction>(ArgTLSPtr)->getNextNode(); + IRBuilder<> IRB(ArgTLSPos); + Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos)); + break; + } + case DataFlowSanitizer::IA_Args: { + unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2; + Function::arg_iterator i = F->arg_begin(); + while (ArgIdx--) + ++i; + Shadow = i; + assert(Shadow->getType() == DFS.ShadowTy); + break; + } + } + NonZeroChecks.push_back(Shadow); + } else { + Shadow = DFS.ZeroShadow; + } + } + return Shadow; +} + +void DFSanFunction::setShadow(Instruction *I, Value *Shadow) { + assert(!ValShadowMap.count(I)); + assert(Shadow->getType() == DFS.ShadowTy); + ValShadowMap[I] = Shadow; +} + +Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { + assert(Addr != RetvalTLS && "Reinstrumenting?"); + IRBuilder<> IRB(Pos); + return IRB.CreateIntToPtr( + IRB.CreateMul( + IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask), + ShadowPtrMul), + ShadowPtrTy); +} + +// Generates IR to compute the union of the two given shadows, inserting it +// before Pos. Returns the computed union Value. 
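+//
+// As a rough sketch (assuming the default 16-bit shadow labels; value and
+// block names are illustrative), combining two distinct shadows %a and %b in
+// the block-splitting mode yields IR shaped like:
+//
+//   %ne = icmp ne i16 %a, %b
+//   br i1 %ne, label %then, label %tail
+// then:
+//   %u = call zeroext i16 @__dfsan_union(i16 zeroext %a, i16 zeroext %b)
+//   br label %tail
+// tail:
+//   %shadow = phi i16 [ %u, %then ], [ %a, %head ]
+//
+// so the common case of shadows that are equal at run time never enters the
+// runtime.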
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { + if (V1 == DFS.ZeroShadow) + return V2; + if (V2 == DFS.ZeroShadow) + return V1; + if (V1 == V2) + return V1; + + auto V1Elems = ShadowElements.find(V1); + auto V2Elems = ShadowElements.find(V2); + if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { + if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), + V2Elems->second.begin(), V2Elems->second.end())) { + return V1; + } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), + V1Elems->second.begin(), V1Elems->second.end())) { + return V2; + } + } else if (V1Elems != ShadowElements.end()) { + if (V1Elems->second.count(V2)) + return V1; + } else if (V2Elems != ShadowElements.end()) { + if (V2Elems->second.count(V1)) + return V2; + } + + auto Key = std::make_pair(V1, V2); + if (V1 > V2) + std::swap(Key.first, Key.second); + CachedCombinedShadow &CCS = CachedCombinedShadows[Key]; + if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent())) + return CCS.Shadow; + + IRBuilder<> IRB(Pos); + if (AvoidNewBlocks) { + CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2}); + Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + Call->addAttribute(1, Attribute::ZExt); + Call->addAttribute(2, Attribute::ZExt); + + CCS.Block = Pos->getParent(); + CCS.Shadow = Call; + } else { + BasicBlock *Head = Pos->getParent(); + Value *Ne = IRB.CreateICmpNE(V1, V2); + BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen( + Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT)); + IRBuilder<> ThenIRB(BI); + CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2}); + Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + Call->addAttribute(1, Attribute::ZExt); + Call->addAttribute(2, Attribute::ZExt); + + BasicBlock *Tail = BI->getSuccessor(0); + PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", Tail->begin()); + Phi->addIncoming(Call, Call->getParent()); + Phi->addIncoming(V1, Head); + + CCS.Block = Tail; + CCS.Shadow = Phi; + } + + std::set<Value *> UnionElems; + if (V1Elems != ShadowElements.end()) { + UnionElems = V1Elems->second; + } else { + UnionElems.insert(V1); + } + if (V2Elems != ShadowElements.end()) { + UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end()); + } else { + UnionElems.insert(V2); + } + ShadowElements[CCS.Shadow] = std::move(UnionElems); + + return CCS.Shadow; +} + +// A convenience function which folds the shadows of each of the operands +// of the provided instruction Inst, inserting the IR before Inst. Returns +// the computed union Value. +Value *DFSanFunction::combineOperandShadows(Instruction *Inst) { + if (Inst->getNumOperands() == 0) + return DFS.ZeroShadow; + + Value *Shadow = getShadow(Inst->getOperand(0)); + for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) { + Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst); + } + return Shadow; +} + +void DFSanVisitor::visitOperandShadowInst(Instruction &I) { + Value *CombinedShadow = DFSF.combineOperandShadows(&I); + DFSF.setShadow(&I, CombinedShadow); +} + +// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where +// Addr has alignment Align, and take the union of each of those shadows. 
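+//
+// For instance (illustrative, with 16-bit labels): the shadow of the two
+// bytes at [Addr, Addr+2) is combineShadows(S0, S1), where S0 and S1 are the
+// labels loaded from getShadowAddress(Addr) and the following label slot;
+// larger loads either call the __dfsan_union_load runtime helper or take the
+// 64-bits-at-a-time fast path below.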
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, + Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + return IRB.CreateLoad(i->second); + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + SmallVector<Value *, 2> Objs; + GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout()); + bool AllConstants = true; + for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end(); + i != e; ++i) { + if (isa<Function>(*i) || isa<BlockAddress>(*i)) + continue; + if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant()) + continue; + + AllConstants = false; + break; + } + if (AllConstants) + return DFS.ZeroShadow; + + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + switch (Size) { + case 0: + return DFS.ZeroShadow; + case 1: { + LoadInst *LI = new LoadInst(ShadowAddr, "", Pos); + LI->setAlignment(ShadowAlign); + return LI; + } + case 2: { + IRBuilder<> IRB(Pos); + Value *ShadowAddr1 = IRB.CreateGEP(DFS.ShadowTy, ShadowAddr, + ConstantInt::get(DFS.IntptrTy, 1)); + return combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign), + IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign), Pos); + } + } + if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) { + // Fast path for the common case where each byte has identical shadow: load + // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any + // shadow is non-equal. + BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F); + IRBuilder<> FallbackIRB(FallbackBB); + CallInst *FallbackCall = FallbackIRB.CreateCall( + DFS.DFSanUnionLoadFn, + {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + + // Compare each of the shadows stored in the loaded 64 bits to each other, + // by computing (WideShadow rotl ShadowWidth) == WideShadow. + IRBuilder<> IRB(Pos); + Value *WideAddr = + IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign); + Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy); + Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth); + Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth); + Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow); + Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow); + + BasicBlock *Head = Pos->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(Pos); + + if (DomTreeNode *OldNode = DT.getNode(Head)) { + std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end()); + + DomTreeNode *NewNode = DT.addNewBlock(Tail, Head); + for (auto Child : Children) + DT.changeImmediateDominator(Child, NewNode); + } + + // In the following code LastBr will refer to the previous basic block's + // conditional branch instruction, whose true successor is fixed up to point + // to the next block during the loop below or to the tail after the final + // iteration. 
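+    //
+    // As a sketch, for a load spanning three 64-bit shadow words (block
+    // names are illustrative):
+    //
+    //   Head: eq0? --false--> FallbackBB (__dfsan_union_load)
+    //     |true
+    //   BB1:  eq1? --false--> FallbackBB
+    //     |true
+    //   BB2:  eq2? --false--> FallbackBB
+    //     |true                   |
+    //   Tail: phi [TruncShadow, BB2], [FallbackCall, FallbackBB]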
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq); + ReplaceInstWithInst(Head->getTerminator(), LastBr); + DT.addNewBlock(FallbackBB, Head); + + for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size; + Ofs += 64 / DFS.ShadowWidth) { + BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F); + DT.addNewBlock(NextBB, LastBr->getParent()); + IRBuilder<> NextIRB(NextBB); + WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr, + ConstantInt::get(DFS.IntptrTy, 1)); + Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign); + ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow); + LastBr->setSuccessor(0, NextBB); + LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB); + } + + LastBr->setSuccessor(0, Tail); + FallbackIRB.CreateBr(Tail); + PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front()); + Shadow->addIncoming(FallbackCall, FallbackBB); + Shadow->addIncoming(TruncShadow, LastBr->getParent()); + return Shadow; + } + + IRBuilder<> IRB(Pos); + CallInst *FallbackCall = IRB.CreateCall( + DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); + FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt); + return FallbackCall; +} + +void DFSanVisitor::visitLoadInst(LoadInst &LI) { + auto &DL = LI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(LI.getType()); + if (Size == 0) { + DFSF.setShadow(&LI, DFSF.DFS.ZeroShadow); + return; + } + + uint64_t Align; + if (ClPreserveAlignment) { + Align = LI.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(LI.getType()); + } else { + Align = 1; + } + IRBuilder<> IRB(&LI); + Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI); + if (ClCombinePointerLabelsOnLoad) { + Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI); + } + if (Shadow != DFSF.DFS.ZeroShadow) + DFSF.NonZeroChecks.push_back(Shadow); + + DFSF.setShadow(&LI, Shadow); +} + +void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align, + Value *Shadow, Instruction *Pos) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) { + llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i = + AllocaShadowMap.find(AI); + if (i != AllocaShadowMap.end()) { + IRBuilder<> IRB(Pos); + IRB.CreateStore(Shadow, i->second); + return; + } + } + + uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8; + IRBuilder<> IRB(Pos); + Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos); + if (Shadow == DFS.ZeroShadow) { + IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth); + Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0); + Value *ExtShadowAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy)); + IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign); + return; + } + + const unsigned ShadowVecSize = 128 / DFS.ShadowWidth; + uint64_t Offset = 0; + if (Size >= ShadowVecSize) { + VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize); + Value *ShadowVec = UndefValue::get(ShadowVecTy); + for (unsigned i = 0; i != ShadowVecSize; ++i) { + ShadowVec = IRB.CreateInsertElement( + ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i)); + } + Value *ShadowVecAddr = + IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy)); + do { + Value *CurShadowVecAddr = + IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset); + IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign); + Size 
-= ShadowVecSize; + ++Offset; + } while (Size >= ShadowVecSize); + Offset *= ShadowVecSize; + } + while (Size > 0) { + Value *CurShadowAddr = + IRB.CreateConstGEP1_32(DFS.ShadowTy, ShadowAddr, Offset); + IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign); + --Size; + ++Offset; + } +} + +void DFSanVisitor::visitStoreInst(StoreInst &SI) { + auto &DL = SI.getModule()->getDataLayout(); + uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType()); + if (Size == 0) + return; + + uint64_t Align; + if (ClPreserveAlignment) { + Align = SI.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(SI.getValueOperand()->getType()); + } else { + Align = 1; + } + + Value* Shadow = DFSF.getShadow(SI.getValueOperand()); + if (ClCombinePointerLabelsOnStore) { + Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand()); + Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI); + } + DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI); +} + +void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) { + visitOperandShadowInst(BO); +} + +void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); } + +void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { + visitOperandShadowInst(GEPI); +} + +void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) { + visitOperandShadowInst(I); +} + +void DFSanVisitor::visitAllocaInst(AllocaInst &I) { + bool AllLoadsStores = true; + for (User *U : I.users()) { + if (isa<LoadInst>(U)) + continue; + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getPointerOperand() == &I) + continue; + } + + AllLoadsStores = false; + break; + } + if (AllLoadsStores) { + IRBuilder<> IRB(&I); + DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy); + } + DFSF.setShadow(&I, DFSF.DFS.ZeroShadow); +} + +void DFSanVisitor::visitSelectInst(SelectInst &I) { + Value *CondShadow = DFSF.getShadow(I.getCondition()); + Value *TrueShadow = DFSF.getShadow(I.getTrueValue()); + Value *FalseShadow = DFSF.getShadow(I.getFalseValue()); + + if (isa<VectorType>(I.getCondition()->getType())) { + DFSF.setShadow( + &I, + DFSF.combineShadows( + CondShadow, DFSF.combineShadows(TrueShadow, FalseShadow, &I), &I)); + } else { + Value *ShadowSel; + if (TrueShadow == FalseShadow) { + ShadowSel = TrueShadow; + } else { + ShadowSel = + SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I); + } + DFSF.setShadow(&I, DFSF.combineShadows(CondShadow, ShadowSel, &I)); + } +} + +void DFSanVisitor::visitMemSetInst(MemSetInst &I) { + IRBuilder<> IRB(&I); + Value *ValShadow = DFSF.getShadow(I.getValue()); + IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn, + {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy( + *DFSF.DFS.Ctx)), + IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)}); +} + +void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) { + IRBuilder<> IRB(&I); + Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I); + Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I); + Value *LenShadow = IRB.CreateMul( + 
I.getLength(), + ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8)); + Value *AlignShadow; + if (ClPreserveAlignment) { + AlignShadow = IRB.CreateMul(I.getAlignmentCst(), + ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8)); + } else { + AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(), + DFSF.DFS.ShadowWidth / 8); + } + Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx); + DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr); + SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr); + IRB.CreateCall(I.getCalledValue(), {DestShadow, SrcShadow, LenShadow, + AlignShadow, I.getVolatileCst()}); +} + +void DFSanVisitor::visitReturnInst(ReturnInst &RI) { + if (!DFSF.IsNativeABI && RI.getReturnValue()) { + switch (DFSF.IA) { + case DataFlowSanitizer::IA_TLS: { + Value *S = DFSF.getShadow(RI.getReturnValue()); + IRBuilder<> IRB(&RI); + IRB.CreateStore(S, DFSF.getRetvalTLS()); + break; + } + case DataFlowSanitizer::IA_Args: { + IRBuilder<> IRB(&RI); + Type *RT = DFSF.F->getFunctionType()->getReturnType(); + Value *InsVal = + IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0); + Value *InsShadow = + IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1); + RI.setOperand(0, InsShadow); + break; + } + } + } +} + +void DFSanVisitor::visitCallSite(CallSite CS) { + Function *F = CS.getCalledFunction(); + if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) { + visitOperandShadowInst(*CS.getInstruction()); + return; + } + + // Calls to this function are synthesized in wrappers, and we shouldn't + // instrument them. + if (F == DFSF.DFS.DFSanVarargWrapperFn) + return; + + assert(!(cast<FunctionType>( + CS.getCalledValue()->getType()->getPointerElementType())->isVarArg() && + dyn_cast<InvokeInst>(CS.getInstruction()))); + + IRBuilder<> IRB(CS.getInstruction()); + + DenseMap<Value *, Function *>::iterator i = + DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue()); + if (i != DFSF.DFS.UnwrappedFnMap.end()) { + Function *F = i->second; + switch (DFSF.DFS.getWrapperKind(F)) { + case DataFlowSanitizer::WK_Warning: { + CS.setCalledFunction(F); + IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn, + IRB.CreateGlobalStringPtr(F->getName())); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Discard: { + CS.setCalledFunction(F); + DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow); + return; + } + case DataFlowSanitizer::WK_Functional: { + CS.setCalledFunction(F); + visitOperandShadowInst(*CS.getInstruction()); + return; + } + case DataFlowSanitizer::WK_Custom: { + // Don't try to handle invokes of custom functions, it's too complicated. + // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_ + // wrapper. + if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) { + FunctionType *FT = F->getFunctionType(); + FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT); + std::string CustomFName = "__dfsw_"; + CustomFName += F->getName(); + Constant *CustomF = + DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT); + if (Function *CustomFn = dyn_cast<Function>(CustomF)) { + CustomFn->copyAttributesFrom(F); + + // Custom functions returning non-void will write to the return label. 
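+          //
+          // A sketch of the custom ABI for illustration (names hypothetical):
+          // for "int f(int x, int y)" the custom wrapper is declared roughly
+          // as
+          //
+          //   int __dfsw_f(int x, int y, dfsan_label x_label,
+          //                dfsan_label y_label, dfsan_label *ret_label);
+          //
+          // one label per formal argument, then an out-parameter that
+          // receives the return value's label.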
+ if (!FT->getReturnType()->isVoidTy()) { + CustomFn->removeAttributes(AttributeSet::FunctionIndex, + DFSF.DFS.ReadOnlyNoneAttrs); + } + } + + std::vector<Value *> Args; + + CallSite::arg_iterator i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) { + Type *T = (*i)->getType(); + FunctionType *ParamFT; + if (isa<PointerType>(T) && + (ParamFT = dyn_cast<FunctionType>( + cast<PointerType>(T)->getElementType()))) { + std::string TName = "dfst"; + TName += utostr(FT->getNumParams() - n); + TName += "$"; + TName += F->getName(); + Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName); + Args.push_back(T); + Args.push_back( + IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx))); + } else { + Args.push_back(*i); + } + } + + i = CS.arg_begin(); + for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) + Args.push_back(DFSF.getShadow(*i)); + + if (FT->isVarArg()) { + auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy, + CS.arg_size() - FT->getNumParams()); + auto *LabelVAAlloca = new AllocaInst(LabelVATy, "labelva", + DFSF.F->getEntryBlock().begin()); + + for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) { + auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n); + IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr); + } + + Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0)); + } + + if (!FT->getReturnType()->isVoidTy()) { + if (!DFSF.LabelReturnAlloca) { + DFSF.LabelReturnAlloca = + new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn", + DFSF.F->getEntryBlock().begin()); + } + Args.push_back(DFSF.LabelReturnAlloca); + } + + for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i) + Args.push_back(*i); + + CallInst *CustomCI = IRB.CreateCall(CustomF, Args); + CustomCI->setCallingConv(CI->getCallingConv()); + CustomCI->setAttributes(CI->getAttributes()); + + if (!FT->getReturnType()->isVoidTy()) { + LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca); + DFSF.setShadow(CustomCI, LabelLoad); + } + + CI->replaceAllUsesWith(CustomCI); + CI->eraseFromParent(); + return; + } + break; + } + } + } + + FunctionType *FT = cast<FunctionType>( + CS.getCalledValue()->getType()->getPointerElementType()); + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) { + IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)), + DFSF.getArgTLS(i, CS.getInstruction())); + } + } + + Instruction *Next = nullptr; + if (!CS.getType()->isVoidTy()) { + if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { + if (II->getNormalDest()->getSinglePredecessor()) { + Next = II->getNormalDest()->begin(); + } else { + BasicBlock *NewBB = + SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT); + Next = NewBB->begin(); + } + } else { + Next = CS->getNextNode(); + } + + if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) { + IRBuilder<> NextIRB(Next); + LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS()); + DFSF.SkipInsts.insert(LI); + DFSF.setShadow(CS.getInstruction(), LI); + DFSF.NonZeroChecks.push_back(LI); + } + } + + // Do all instrumentation for IA_Args down here to defer tampering with the + // CFG in a way that SplitEdge may be able to detect. 
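+  //
+  // For example (illustrative, with 16-bit labels): under IA_Args a call
+  //
+  //   %r = call i32 @f(i32 %x)
+  //
+  // becomes roughly
+  //
+  //   %agg = call { i32, i16 } @f(i32 %x, i16 %x_shadow)
+  //
+  // and the value and its shadow are recovered below with extractvalue 0 and
+  // extractvalue 1.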
+  if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+    FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+    Value *Func =
+        IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+    std::vector<Value *> Args;
+
+    CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(*i);
+
+    i = CS.arg_begin();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(DFSF.getShadow(*i));
+
+    if (FT->isVarArg()) {
+      unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+      ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
+      AllocaInst *VarArgShadow =
+          new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin());
+      Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
+      for (unsigned n = 0; i != e; ++i, ++n) {
+        IRB.CreateStore(
+            DFSF.getShadow(*i),
+            IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
+        Args.push_back(*i);
+      }
+    }
+
+    CallSite NewCS;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
+                               Args);
+    } else {
+      NewCS = IRB.CreateCall(Func, Args);
+    }
+    NewCS.setCallingConv(CS.getCallingConv());
+    NewCS.setAttributes(CS.getAttributes().removeAttributes(
+        *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+        AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
+
+    if (Next) {
+      ExtractValueInst *ExVal =
+          ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+      DFSF.SkipInsts.insert(ExVal);
+      ExtractValueInst *ExShadow =
+          ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+      DFSF.SkipInsts.insert(ExShadow);
+      DFSF.setShadow(ExVal, ExShadow);
+      DFSF.NonZeroChecks.push_back(ExShadow);
+
+      CS.getInstruction()->replaceAllUsesWith(ExVal);
+    }
+
+    CS.getInstruction()->eraseFromParent();
+  }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+  PHINode *ShadowPN =
+      PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+  // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+  Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+  for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+       ++i) {
+    ShadowPN->addIncoming(UndefShadow, *i);
+  }
+
+  DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+  DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 0000000..9a3ed5c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,978 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that
+// runs to record the edges between blocks that run and to emit a
+// complementary "gcda" file on exit.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include <algorithm> +#include <memory> +#include <string> +#include <utility> +using namespace llvm; + +#define DEBUG_TYPE "insert-gcov-profiling" + +static cl::opt<std::string> +DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden, + cl::ValueRequired); +static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body", + cl::init(false), cl::Hidden); + +GCOVOptions GCOVOptions::getDefault() { + GCOVOptions Options; + Options.EmitNotes = true; + Options.EmitData = true; + Options.UseCfgChecksum = false; + Options.NoRedZone = false; + Options.FunctionNamesInData = true; + Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody; + + if (DefaultGCOVVersion.size() != 4) { + llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") + + DefaultGCOVVersion); + } + memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4); + return Options; +} + +namespace { + class GCOVFunction; + + class GCOVProfiler : public ModulePass { + public: + static char ID; + GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {} + GCOVProfiler(const GCOVOptions &Opts) : ModulePass(ID), Options(Opts) { + assert((Options.EmitNotes || Options.EmitData) && + "GCOVProfiler asked to do nothing?"); + ReversedVersion[0] = Options.Version[3]; + ReversedVersion[1] = Options.Version[2]; + ReversedVersion[2] = Options.Version[1]; + ReversedVersion[3] = Options.Version[0]; + ReversedVersion[4] = '\0'; + initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "GCOV Profiler"; + } + + private: + bool runOnModule(Module &M) override; + + // Create the .gcno files for the Module based on DebugInfo. + void emitProfileNotes(); + + // Modify the program to track transitions along edges and call into the + // profiling runtime to emit .gcda files when run. + bool emitProfileArcs(); + + // Get pointers to the functions in the runtime library. + Constant *getStartFileFunc(); + Constant *getIncrementIndirectCounterFunc(); + Constant *getEmitFunctionFunc(); + Constant *getEmitArcsFunc(); + Constant *getSummaryInfoFunc(); + Constant *getDeleteWriteoutFunctionListFunc(); + Constant *getDeleteFlushFunctionListFunc(); + Constant *getEndFileFunc(); + + // Create or retrieve an i32 state value that is used to represent the + // pred block number for certain non-trivial edges. + GlobalVariable *getEdgeStateValue(); + + // Produce a table of pointers to counters, by predecessor and successor + // block number. 
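+    // Conceptually (illustrative) the table is
+    //   EdgeTable[succ_id][pred_id] == &Counters[edge from pred to succ]
+    // flattened into a single [preds * succs x i64*] array, with null
+    // entries for pred/succ pairs that have no edge.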
+ GlobalVariable *buildEdgeLookupTable(Function *F, + GlobalVariable *Counter, + const UniqueVector<BasicBlock *>&Preds, + const UniqueVector<BasicBlock*>&Succs); + + // Add the function to write out all our counters to the global destructor + // list. + Function *insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, + MDNode*> >); + Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); + void insertIndirectCounterIncrement(); + + std::string mangleName(const DICompileUnit *CU, const char *NewStem); + + GCOVOptions Options; + + // Reversed, NUL-terminated copy of Options.Version. + char ReversedVersion[5]; + // Checksum, produced by hash of EdgeDestinations + SmallVector<uint32_t, 4> FileChecksums; + + Module *M; + LLVMContext *Ctx; + SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; + }; +} + +char GCOVProfiler::ID = 0; +INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling", + "Insert instrumentation for GCOV profiling", false, false) + +ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { + return new GCOVProfiler(Options); +} + +static StringRef getFunctionName(const DISubprogram *SP) { + if (!SP->getLinkageName().empty()) + return SP->getLinkageName(); + return SP->getName(); +} + +namespace { + class GCOVRecord { + protected: + static const char *const LinesTag; + static const char *const FunctionTag; + static const char *const BlockTag; + static const char *const EdgeTag; + + GCOVRecord() = default; + + void writeBytes(const char *Bytes, int Size) { + os->write(Bytes, Size); + } + + void write(uint32_t i) { + writeBytes(reinterpret_cast<char*>(&i), 4); + } + + // Returns the length measured in 4-byte blocks that will be used to + // represent this string in a GCOV file + static unsigned lengthOfGCOVString(StringRef s) { + // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs + // padding out to the next 4-byte word. The length is measured in 4-byte + // words including padding, not bytes of actual string. + return (s.size() / 4) + 1; + } + + void writeGCOVString(StringRef s) { + uint32_t Len = lengthOfGCOVString(s); + write(Len); + writeBytes(s.data(), s.size()); + + // Write 1 to 4 bytes of NUL padding. + assert((unsigned)(4 - (s.size() % 4)) > 0); + assert((unsigned)(4 - (s.size() % 4)) <= 4); + writeBytes("\0\0\0\0", 4 - (s.size() % 4)); + } + + raw_ostream *os; + }; + const char *const GCOVRecord::LinesTag = "\0\0\x45\x01"; + const char *const GCOVRecord::FunctionTag = "\0\0\0\1"; + const char *const GCOVRecord::BlockTag = "\0\0\x41\x01"; + const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01"; + + class GCOVFunction; + class GCOVBlock; + + // Constructed only by requesting it from a GCOVBlock, this object stores a + // list of line numbers and a single filename, representing lines that belong + // to the block. + class GCOVLines : public GCOVRecord { + public: + void addLine(uint32_t Line) { + assert(Line != 0 && "Line zero is not a valid real line number."); + Lines.push_back(Line); + } + + uint32_t length() const { + // Here 2 = 1 for string length + 1 for '0' id#. + return lengthOfGCOVString(Filename) + 2 + Lines.size(); + } + + void writeOut() { + write(0); + writeGCOVString(Filename); + for (int i = 0, e = Lines.size(); i != e; ++i) + write(Lines[i]); + } + + GCOVLines(StringRef F, raw_ostream *os) + : Filename(F) { + this->os = os; + } + + private: + StringRef Filename; + SmallVector<uint32_t, 32> Lines; + }; + + + // Represent a basic block in GCOV. 
Each block has a unique number in the + // function, number of lines belonging to each block, and a set of edges to + // other blocks. + class GCOVBlock : public GCOVRecord { + public: + GCOVLines &getFile(StringRef Filename) { + GCOVLines *&Lines = LinesByFile[Filename]; + if (!Lines) { + Lines = new GCOVLines(Filename, os); + } + return *Lines; + } + + void addEdge(GCOVBlock &Successor) { + OutEdges.push_back(&Successor); + } + + void writeOut() { + uint32_t Len = 3; + SmallVector<StringMapEntry<GCOVLines *> *, 32> SortedLinesByFile; + for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(), + E = LinesByFile.end(); I != E; ++I) { + Len += I->second->length(); + SortedLinesByFile.push_back(&*I); + } + + writeBytes(LinesTag, 4); + write(Len); + write(Number); + + std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(), + [](StringMapEntry<GCOVLines *> *LHS, + StringMapEntry<GCOVLines *> *RHS) { + return LHS->getKey() < RHS->getKey(); + }); + for (SmallVectorImpl<StringMapEntry<GCOVLines *> *>::iterator + I = SortedLinesByFile.begin(), E = SortedLinesByFile.end(); + I != E; ++I) + (*I)->getValue()->writeOut(); + write(0); + write(0); + } + + ~GCOVBlock() { + DeleteContainerSeconds(LinesByFile); + } + + GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) { + // Only allow copy before edges and lines have been added. After that, + // there are inter-block pointers (eg: edges) that won't take kindly to + // blocks being copied or moved around. + assert(LinesByFile.empty()); + assert(OutEdges.empty()); + } + + private: + friend class GCOVFunction; + + GCOVBlock(uint32_t Number, raw_ostream *os) + : Number(Number) { + this->os = os; + } + + uint32_t Number; + StringMap<GCOVLines *> LinesByFile; + SmallVector<GCOVBlock *, 4> OutEdges; + }; + + // A function has a unique identifier, a checksum (we leave as zero) and a + // set of blocks and a map of edges between blocks. This is the only GCOV + // object users can construct, the blocks and lines will be rooted here. + class GCOVFunction : public GCOVRecord { + public: + GCOVFunction(const DISubprogram *SP, raw_ostream *os, uint32_t Ident, + bool UseCfgChecksum, bool ExitBlockBeforeBody) + : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0), + ReturnBlock(1, os) { + this->os = os; + + Function *F = SP->getFunction(); + DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n"); + + uint32_t i = 0; + for (auto &BB : *F) { + // Skip index 1 if it's assigned to the ReturnBlock. 
+ if (i == 1 && ExitBlockBeforeBody) + ++i; + Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os))); + } + if (!ExitBlockBeforeBody) + ReturnBlock.Number = i; + + std::string FunctionNameAndLine; + raw_string_ostream FNLOS(FunctionNameAndLine); + FNLOS << getFunctionName(SP) << SP->getLine(); + FNLOS.flush(); + FuncChecksum = hash_value(FunctionNameAndLine); + } + + GCOVBlock &getBlock(BasicBlock *BB) { + return Blocks.find(BB)->second; + } + + GCOVBlock &getReturnBlock() { + return ReturnBlock; + } + + std::string getEdgeDestinations() { + std::string EdgeDestinations; + raw_string_ostream EDOS(EdgeDestinations); + Function *F = Blocks.begin()->first->getParent(); + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + GCOVBlock &Block = getBlock(I); + for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) + EDOS << Block.OutEdges[i]->Number; + } + return EdgeDestinations; + } + + uint32_t getFuncChecksum() { + return FuncChecksum; + } + + void setCfgChecksum(uint32_t Checksum) { + CfgChecksum = Checksum; + } + + void writeOut() { + writeBytes(FunctionTag, 4); + uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + + 1 + lengthOfGCOVString(SP->getFilename()) + 1; + if (UseCfgChecksum) + ++BlockLen; + write(BlockLen); + write(Ident); + write(FuncChecksum); + if (UseCfgChecksum) + write(CfgChecksum); + writeGCOVString(getFunctionName(SP)); + writeGCOVString(SP->getFilename()); + write(SP->getLine()); + + // Emit count of blocks. + writeBytes(BlockTag, 4); + write(Blocks.size() + 1); + for (int i = 0, e = Blocks.size() + 1; i != e; ++i) { + write(0); // No flags on our blocks. + } + DEBUG(dbgs() << Blocks.size() << " blocks.\n"); + + // Emit edges between blocks. + if (Blocks.empty()) return; + Function *F = Blocks.begin()->first->getParent(); + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + GCOVBlock &Block = getBlock(I); + if (Block.OutEdges.empty()) continue; + + writeBytes(EdgeTag, 4); + write(Block.OutEdges.size() * 2 + 1); + write(Block.Number); + for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) { + DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number + << "\n"); + write(Block.OutEdges[i]->Number); + write(0); // no flags + } + } + + // Emit lines for each block. 
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { + getBlock(I).writeOut(); + } + } + + private: + const DISubprogram *SP; + uint32_t Ident; + uint32_t FuncChecksum; + bool UseCfgChecksum; + uint32_t CfgChecksum; + DenseMap<BasicBlock *, GCOVBlock> Blocks; + GCOVBlock ReturnBlock; + }; +} + +std::string GCOVProfiler::mangleName(const DICompileUnit *CU, + const char *NewStem) { + if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) { + for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) { + MDNode *N = GCov->getOperand(i); + if (N->getNumOperands() != 2) continue; + MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0)); + MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1)); + if (!GCovFile || !CompileUnit) continue; + if (CompileUnit == CU) { + SmallString<128> Filename = GCovFile->getString(); + sys::path::replace_extension(Filename, NewStem); + return Filename.str(); + } + } + } + + SmallString<128> Filename = CU->getFilename(); + sys::path::replace_extension(Filename, NewStem); + StringRef FName = sys::path::filename(Filename); + SmallString<128> CurPath; + if (sys::fs::current_path(CurPath)) return FName; + sys::path::append(CurPath, FName); + return CurPath.str(); +} + +bool GCOVProfiler::runOnModule(Module &M) { + this->M = &M; + Ctx = &M.getContext(); + + if (Options.EmitNotes) emitProfileNotes(); + if (Options.EmitData) return emitProfileArcs(); + return false; +} + +static bool functionHasLines(Function *F) { + // Check whether this function actually has any source lines. Not only + // do these waste space, they also can crash gcov. + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); + I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(I)) continue; + + const DebugLoc &Loc = I->getDebugLoc(); + if (!Loc) + continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + + return true; + } + } + return false; +} + +void GCOVProfiler::emitProfileNotes() { + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) return; + + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + // Each compile unit gets its own .gcno file. This means that whether we run + // this pass over the original .o's as they're produced, or run it after + // LTO, we'll generate the same .gcno files. + + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); + std::error_code EC; + raw_fd_ostream out(mangleName(CU, "gcno"), EC, sys::fs::F_None); + std::string EdgeDestinations; + + unsigned FunctionIdent = 0; + for (auto *SP : CU->getSubprograms()) { + Function *F = SP->getFunction(); + if (!F) continue; + if (!functionHasLines(F)) continue; + + // gcov expects every function to start with an entry block that has a + // single successor, so split the entry block to make sure of that. 
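+      //
+      // E.g. (illustrative) an entry block of the form "allocas; dbg; body"
+      // is split after the leading allocas and debug intrinsics, leaving an
+      // entry block whose only successor is the block holding the body.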
+ BasicBlock &EntryBlock = F->getEntryBlock(); + BasicBlock::iterator It = EntryBlock.begin(); + while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) + ++It; + EntryBlock.splitBasicBlock(It); + + Funcs.push_back(make_unique<GCOVFunction>(SP, &out, FunctionIdent++, + Options.UseCfgChecksum, + Options.ExitBlockBeforeBody)); + GCOVFunction &Func = *Funcs.back(); + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + GCOVBlock &Block = Func.getBlock(BB); + TerminatorInst *TI = BB->getTerminator(); + if (int successors = TI->getNumSuccessors()) { + for (int i = 0; i != successors; ++i) { + Block.addEdge(Func.getBlock(TI->getSuccessor(i))); + } + } else if (isa<ReturnInst>(TI)) { + Block.addEdge(Func.getReturnBlock()); + } + + uint32_t Line = 0; + for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); + I != IE; ++I) { + // Debug intrinsic locations correspond to the location of the + // declaration, not necessarily any statements or expressions. + if (isa<DbgInfoIntrinsic>(I)) continue; + + const DebugLoc &Loc = I->getDebugLoc(); + if (!Loc) + continue; + + // Artificial lines such as calls to the global constructors. + if (Loc.getLine() == 0) continue; + + if (Line == Loc.getLine()) continue; + Line = Loc.getLine(); + if (SP != getDISubprogram(Loc.getScope())) + continue; + + GCOVLines &Lines = Block.getFile(SP->getFilename()); + Lines.addLine(Loc.getLine()); + } + } + EdgeDestinations += Func.getEdgeDestinations(); + } + + FileChecksums.push_back(hash_value(EdgeDestinations)); + out.write("oncg", 4); + out.write(ReversedVersion, 4); + out.write(reinterpret_cast<char*>(&FileChecksums.back()), 4); + + for (auto &Func : Funcs) { + Func->setCfgChecksum(FileChecksums.back()); + Func->writeOut(); + } + + out.write("\0\0\0\0\0\0\0\0", 8); // EOF + out.close(); + } +} + +bool GCOVProfiler::emitProfileArcs() { + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) return false; + + bool Result = false; + bool InsertIndCounterIncrCode = false; + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); + SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP; + for (auto *SP : CU->getSubprograms()) { + Function *F = SP->getFunction(); + if (!F) continue; + if (!functionHasLines(F)) continue; + if (!Result) Result = true; + unsigned Edges = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + if (isa<ReturnInst>(TI)) + ++Edges; + else + Edges += TI->getNumSuccessors(); + } + + ArrayType *CounterTy = + ArrayType::get(Type::getInt64Ty(*Ctx), Edges); + GlobalVariable *Counters = + new GlobalVariable(*M, CounterTy, false, + GlobalValue::InternalLinkage, + Constant::getNullValue(CounterTy), + "__llvm_gcov_ctr"); + CountersBySP.push_back(std::make_pair(Counters, SP)); + + UniqueVector<BasicBlock *> ComplexEdgePreds; + UniqueVector<BasicBlock *> ComplexEdgeSuccs; + + unsigned Edge = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + int Successors = isa<ReturnInst>(TI) ? 
1 : TI->getNumSuccessors(); + if (Successors) { + if (Successors == 1) { + IRBuilder<> Builder(BB->getFirstInsertionPt()); + Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, + Edge); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, Builder.getInt64(1)); + Builder.CreateStore(Count, Counter); + } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + IRBuilder<> Builder(BI); + Value *Sel = Builder.CreateSelect(BI->getCondition(), + Builder.getInt64(Edge), + Builder.getInt64(Edge + 1)); + SmallVector<Value *, 2> Idx; + Idx.push_back(Builder.getInt64(0)); + Idx.push_back(Sel); + Value *Counter = Builder.CreateInBoundsGEP(Counters->getValueType(), + Counters, Idx); + Value *Count = Builder.CreateLoad(Counter); + Count = Builder.CreateAdd(Count, Builder.getInt64(1)); + Builder.CreateStore(Count, Counter); + } else { + ComplexEdgePreds.insert(BB); + for (int i = 0; i != Successors; ++i) + ComplexEdgeSuccs.insert(TI->getSuccessor(i)); + } + + Edge += Successors; + } + } + + if (!ComplexEdgePreds.empty()) { + GlobalVariable *EdgeTable = + buildEdgeLookupTable(F, Counters, + ComplexEdgePreds, ComplexEdgeSuccs); + GlobalVariable *EdgeState = getEdgeStateValue(); + + for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) { + IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt()); + Builder.CreateStore(Builder.getInt32(i), EdgeState); + } + + for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) { + // Call runtime to perform increment. + IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt()); + Value *CounterPtrArray = + Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, + i * ComplexEdgePreds.size()); + + // Build code to increment the counter. + InsertIndCounterIncrCode = true; + Builder.CreateCall(getIncrementIndirectCounterFunc(), + {EdgeState, CounterPtrArray}); + } + } + } + + Function *WriteoutF = insertCounterWriteout(CountersBySP); + Function *FlushF = insertFlush(CountersBySP); + + // Create a small bit of code that registers the "__llvm_gcov_writeout" to + // be executed at exit and the "__llvm_gcov_flush" function to be executed + // when "__gcov_flush" is called. + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(true); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + IRBuilder<> Builder(BB); + + FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Type *Params[] = { + PointerType::get(FTy, 0), + PointerType::get(FTy, 0) + }; + FTy = FunctionType::get(Builder.getVoidTy(), Params, false); + + // Initialize the environment and register the local writeout and flush + // functions. + Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); + Builder.CreateCall(GCOVInit, {WriteoutF, FlushF}); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); + } + + if (InsertIndCounterIncrCode) + insertIndirectCounterIncrement(); + + return Result; +} + +// All edges with successors that aren't branches are "complex", because it +// requires complex logic to pick which counter to update. +GlobalVariable *GCOVProfiler::buildEdgeLookupTable( + Function *F, + GlobalVariable *Counters, + const UniqueVector<BasicBlock *> &Preds, + const UniqueVector<BasicBlock *> &Succs) { + // TODO: support invoke, threads. 
We rely on the fact that nothing can modify + // the whole-Module pred edge# between the time we set it and the time we next + // read it. Threads and invoke make this untrue. + + // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]]. + size_t TableSize = Succs.size() * Preds.size(); + Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx); + ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize); + + std::unique_ptr<Constant * []> EdgeTable(new Constant *[TableSize]); + Constant *NullValue = Constant::getNullValue(Int64PtrTy); + for (size_t i = 0; i != TableSize; ++i) + EdgeTable[i] = NullValue; + + unsigned Edge = 0; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + TerminatorInst *TI = BB->getTerminator(); + int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors(); + if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) { + for (int i = 0; i != Successors; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + IRBuilder<> Builder(Succ); + Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0, + Edge + i); + EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) + + (Preds.idFor(BB)-1)] = cast<Constant>(Counter); + } + } + Edge += Successors; + } + + GlobalVariable *EdgeTableGV = + new GlobalVariable( + *M, EdgeTableTy, true, GlobalValue::InternalLinkage, + ConstantArray::get(EdgeTableTy, + makeArrayRef(&EdgeTable[0],TableSize)), + "__llvm_gcda_edge_table"); + EdgeTableGV->setUnnamedAddr(true); + return EdgeTableGV; +} + +Constant *GCOVProfiler::getStartFileFunc() { + Type *Args[] = { + Type::getInt8PtrTy(*Ctx), // const char *orig_filename + Type::getInt8PtrTy(*Ctx), // const char version[4] + Type::getInt32Ty(*Ctx), // uint32_t checksum + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("llvm_gcda_start_file", FTy); +} + +Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); + Type *Args[] = { + Int32Ty->getPointerTo(), // uint32_t *predecessor + Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy); +} + +Constant *GCOVProfiler::getEmitFunctionFunc() { + Type *Args[] = { + Type::getInt32Ty(*Ctx), // uint32_t ident + Type::getInt8PtrTy(*Ctx), // const char *function_name + Type::getInt32Ty(*Ctx), // uint32_t func_checksum + Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum + Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); +} + +Constant *GCOVProfiler::getEmitArcsFunc() { + Type *Args[] = { + Type::getInt32Ty(*Ctx), // uint32_t num_counters + Type::getInt64PtrTy(*Ctx), // uint64_t *counters + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy); +} + +Constant *GCOVProfiler::getSummaryInfoFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_gcda_summary_info", FTy); +} + +Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy); +} + +Constant 
*GCOVProfiler::getDeleteFlushFunctionListFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy); +} + +Constant *GCOVProfiler::getEndFileFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_gcda_end_file", FTy); +} + +GlobalVariable *GCOVProfiler::getEdgeStateValue() { + GlobalVariable *GV = M->getGlobalVariable("__llvm_gcov_global_state_pred"); + if (!GV) { + GV = new GlobalVariable(*M, Type::getInt32Ty(*Ctx), false, + GlobalValue::InternalLinkage, + ConstantInt::get(Type::getInt32Ty(*Ctx), + 0xffffffff), + "__llvm_gcov_global_state_pred"); + GV->setUnnamedAddr(true); + } + return GV; +} + +Function *GCOVProfiler::insertCounterWriteout( + ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) { + FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *WriteoutF = M->getFunction("__llvm_gcov_writeout"); + if (!WriteoutF) + WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage, + "__llvm_gcov_writeout", M); + WriteoutF->setUnnamedAddr(true); + WriteoutF->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + WriteoutF->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); + IRBuilder<> Builder(BB); + + Constant *StartFile = getStartFileFunc(); + Constant *EmitFunction = getEmitFunctionFunc(); + Constant *EmitArcs = getEmitArcsFunc(); + Constant *SummaryInfo = getSummaryInfoFunc(); + Constant *EndFile = getEndFileFunc(); + + NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); + if (CU_Nodes) { + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { + auto *CU = cast<DICompileUnit>(CU_Nodes->getOperand(i)); + std::string FilenameGcda = mangleName(CU, "gcda"); + uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i]; + Builder.CreateCall(StartFile, + {Builder.CreateGlobalStringPtr(FilenameGcda), + Builder.CreateGlobalStringPtr(ReversedVersion), + Builder.getInt32(CfgChecksum)}); + for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) { + auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second); + uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum(); + Builder.CreateCall( + EmitFunction, + {Builder.getInt32(j), + Options.FunctionNamesInData + ? Builder.CreateGlobalStringPtr(getFunctionName(SP)) + : Constant::getNullValue(Builder.getInt8PtrTy()), + Builder.getInt32(FuncChecksum), + Builder.getInt8(Options.UseCfgChecksum), + Builder.getInt32(CfgChecksum)}); + + GlobalVariable *GV = CountersBySP[j].first; + unsigned Arcs = + cast<ArrayType>(GV->getType()->getElementType())->getNumElements(); + Builder.CreateCall(EmitArcs, {Builder.getInt32(Arcs), + Builder.CreateConstGEP2_64(GV, 0, 0)}); + } + Builder.CreateCall(SummaryInfo, {}); + Builder.CreateCall(EndFile, {}); + } + } + + Builder.CreateRetVoid(); + return WriteoutF; +} + +void GCOVProfiler::insertIndirectCounterIncrement() { + Function *Fn = + cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc()); + Fn->setUnnamedAddr(true); + Fn->setLinkage(GlobalValue::InternalLinkage); + Fn->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + Fn->addFnAttr(Attribute::NoRedZone); + + // Create basic blocks for function. 
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn); + IRBuilder<> Builder(BB); + + BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn); + + // uint32_t pred = *predecessor; + // if (pred == 0xffffffff) return; + Argument *Arg = Fn->arg_begin(); + Arg->setName("predecessor"); + Value *Pred = Builder.CreateLoad(Arg, "pred"); + Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff)); + BranchInst::Create(Exit, PredNotNegOne, Cond, BB); + + Builder.SetInsertPoint(PredNotNegOne); + + // uint64_t *counter = counters[pred]; + // if (!counter) return; + Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty()); + Arg = std::next(Fn->arg_begin()); + Arg->setName("counters"); + Value *GEP = Builder.CreateGEP(Type::getInt64PtrTy(*Ctx), Arg, ZExtPred); + Value *Counter = Builder.CreateLoad(GEP, "counter"); + Cond = Builder.CreateICmpEQ(Counter, + Constant::getNullValue( + Builder.getInt64Ty()->getPointerTo())); + Builder.CreateCondBr(Cond, Exit, CounterEnd); + + // ++*counter; + Builder.SetInsertPoint(CounterEnd); + Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter), + Builder.getInt64(1)); + Builder.CreateStore(Add, Counter); + Builder.CreateBr(Exit); + + // Fill in the exit block. + Builder.SetInsertPoint(Exit); + Builder.CreateRetVoid(); +} + +Function *GCOVProfiler:: +insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *FlushF = M->getFunction("__llvm_gcov_flush"); + if (!FlushF) + FlushF = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_flush", M); + else + FlushF->setLinkage(GlobalValue::InternalLinkage); + FlushF->setUnnamedAddr(true); + FlushF->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + FlushF->addFnAttr(Attribute::NoRedZone); + + BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF); + + // Write out the current counters. + Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout"); + assert(WriteoutF && "Need to create the writeout function first!"); + + IRBuilder<> Builder(Entry); + Builder.CreateCall(WriteoutF, {}); + + // Zero out the counters. + for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator + I = CountersBySP.begin(), E = CountersBySP.end(); + I != E; ++I) { + GlobalVariable *GV = I->first; + Constant *Null = Constant::getNullValue(GV->getType()->getElementType()); + Builder.CreateStore(Null, GV); + } + + Type *RetTy = FlushF->getReturnType(); + if (RetTy == Type::getVoidTy(*Ctx)) + Builder.CreateRetVoid(); + else if (RetTy->isIntegerTy()) + // Used if __llvm_gcov_flush was implicitly declared. + Builder.CreateRet(ConstantInt::get(RetTy, 0)); + else + report_fatal_error("invalid return type for __llvm_gcov_flush"); + + return FlushF; +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp new file mode 100644 index 0000000..712bf8e --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -0,0 +1,377 @@ +//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass lowers instrprof_increment intrinsics emitted by a frontend for +// profiling. It also builds the data structures and initialization code needed +// for updating execution counts and emitting the profile at runtime. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" + +#include "llvm/ADT/Triple.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "instrprof" + +namespace { + +class InstrProfiling : public ModulePass { +public: + static char ID; + + InstrProfiling() : ModulePass(ID) {} + + InstrProfiling(const InstrProfOptions &Options) + : ModulePass(ID), Options(Options) {} + + const char *getPassName() const override { + return "Frontend instrumentation-based coverage lowering"; + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + +private: + InstrProfOptions Options; + Module *M; + DenseMap<GlobalVariable *, GlobalVariable *> RegionCounters; + std::vector<Value *> UsedVars; + + bool isMachO() const { + return Triple(M->getTargetTriple()).isOSBinFormatMachO(); + } + + /// Get the section name for the counter variables. + StringRef getCountersSection() const { + return isMachO() ? "__DATA,__llvm_prf_cnts" : "__llvm_prf_cnts"; + } + + /// Get the section name for the name variables. + StringRef getNameSection() const { + return isMachO() ? "__DATA,__llvm_prf_names" : "__llvm_prf_names"; + } + + /// Get the section name for the profile data variables. + StringRef getDataSection() const { + return isMachO() ? "__DATA,__llvm_prf_data" : "__llvm_prf_data"; + } + + /// Get the section name for the coverage mapping data. + StringRef getCoverageSection() const { + return isMachO() ? "__DATA,__llvm_covmap" : "__llvm_covmap"; + } + + /// Replace instrprof_increment with an increment of the appropriate value. + void lowerIncrement(InstrProfIncrementInst *Inc); + + /// Set up the section and uses for coverage data and its references. + void lowerCoverageData(GlobalVariable *CoverageData); + + /// Get the region counters for an increment, creating them if necessary. + /// + /// If the counter array doesn't yet exist, the profile data variables + /// referring to them will also be created. + GlobalVariable *getOrCreateRegionCounters(InstrProfIncrementInst *Inc); + + /// Emit runtime registration functions for each profile data variable. + void emitRegistration(); + + /// Emit the necessary plumbing to pull in the runtime initialization. + void emitRuntimeHook(); + + /// Add uses of our data variables and runtime hook. + void emitUses(); + + /// Create a static initializer for our data, on platforms that need it, + /// and for any profile output file that was specified. 
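+  ///
+  /// Roughly, the emitted initializer looks like this (an illustrative C
+  /// sketch of the lowering in emitInitialization() below; both callees are
+  /// runtime symbols this pass already references, and "<file>" stands for
+  /// the configured output filename):
+  ///
+  ///   static void __llvm_profile_init(void) {
+  ///     __llvm_profile_register_functions();
+  ///     __llvm_profile_override_default_filename("<file>");
+  ///   }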
+ void emitInitialization(); +}; + +} // anonymous namespace + +char InstrProfiling::ID = 0; +INITIALIZE_PASS(InstrProfiling, "instrprof", + "Frontend instrumentation-based coverage lowering.", false, + false) + +ModulePass *llvm::createInstrProfilingPass(const InstrProfOptions &Options) { + return new InstrProfiling(Options); +} + +bool InstrProfiling::runOnModule(Module &M) { + bool MadeChange = false; + + this->M = &M; + RegionCounters.clear(); + UsedVars.clear(); + + for (Function &F : M) + for (BasicBlock &BB : F) + for (auto I = BB.begin(), E = BB.end(); I != E;) + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(I++)) { + lowerIncrement(Inc); + MadeChange = true; + } + if (GlobalVariable *Coverage = M.getNamedGlobal("__llvm_coverage_mapping")) { + lowerCoverageData(Coverage); + MadeChange = true; + } + if (!MadeChange) + return false; + + emitRegistration(); + emitRuntimeHook(); + emitUses(); + emitInitialization(); + return true; +} + +void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) { + GlobalVariable *Counters = getOrCreateRegionCounters(Inc); + + IRBuilder<> Builder(Inc->getParent(), *Inc); + uint64_t Index = Inc->getIndex()->getZExtValue(); + Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index); + Value *Count = Builder.CreateLoad(Addr, "pgocount"); + Count = Builder.CreateAdd(Count, Builder.getInt64(1)); + Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr)); + Inc->eraseFromParent(); +} + +void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageData) { + CoverageData->setSection(getCoverageSection()); + CoverageData->setAlignment(8); + + Constant *Init = CoverageData->getInitializer(); + // We're expecting { i32, i32, i32, i32, [n x { i8*, i32, i32 }], [m x i8] } + // for some C. If not, the frontend's given us something broken. + assert(Init->getNumOperands() == 6 && "bad number of fields in coverage map"); + assert(isa<ConstantArray>(Init->getAggregateElement(4)) && + "invalid function list in coverage map"); + ConstantArray *Records = cast<ConstantArray>(Init->getAggregateElement(4)); + for (unsigned I = 0, E = Records->getNumOperands(); I < E; ++I) { + Constant *Record = Records->getOperand(I); + Value *V = const_cast<Value *>(Record->getOperand(0))->stripPointerCasts(); + + assert(isa<GlobalVariable>(V) && "Missing reference to function name"); + GlobalVariable *Name = cast<GlobalVariable>(V); + + // If we have region counters for this name, we've already handled it. + auto It = RegionCounters.find(Name); + if (It != RegionCounters.end()) + continue; + + // Move the name variable to the right section. + Name->setSection(getNameSection()); + Name->setAlignment(1); + } +} + +/// Get the name of a profiling variable for a particular function. +static std::string getVarName(InstrProfIncrementInst *Inc, StringRef VarName) { + auto *Arr = cast<ConstantDataArray>(Inc->getName()->getInitializer()); + StringRef Name = Arr->isCString() ? Arr->getAsCString() : Arr->getAsString(); + return ("__llvm_profile_" + VarName + "_" + Name).str(); +} + +GlobalVariable * +InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { + GlobalVariable *Name = Inc->getName(); + auto It = RegionCounters.find(Name); + if (It != RegionCounters.end()) + return It->second; + + // Move the name variable to the right section. Make sure it is placed in the + // same comdat as its associated function. Otherwise, we may get multiple + // counters for the same function in certain cases. 
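+  // The profile data record created below has the layout
+  // { i32 NameSize, i32 NumCounters, i64 FuncHash, i8 *Name, i64 *Counters }
+  // -- this just restates the DataTypes/DataVals arrays that follow.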
+ Function *Fn = Inc->getParent()->getParent(); + Name->setSection(getNameSection()); + Name->setAlignment(1); + Name->setComdat(Fn->getComdat()); + + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); + LLVMContext &Ctx = M->getContext(); + ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters); + + // Create the counters variable. + auto *Counters = new GlobalVariable(*M, CounterTy, false, Name->getLinkage(), + Constant::getNullValue(CounterTy), + getVarName(Inc, "counters")); + Counters->setVisibility(Name->getVisibility()); + Counters->setSection(getCountersSection()); + Counters->setAlignment(8); + Counters->setComdat(Fn->getComdat()); + + RegionCounters[Inc->getName()] = Counters; + + // Create data variable. + auto *NameArrayTy = Name->getType()->getPointerElementType(); + auto *Int32Ty = Type::getInt32Ty(Ctx); + auto *Int64Ty = Type::getInt64Ty(Ctx); + auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); + auto *Int64PtrTy = Type::getInt64PtrTy(Ctx); + + Type *DataTypes[] = {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int64PtrTy}; + auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes)); + Constant *DataVals[] = { + ConstantInt::get(Int32Ty, NameArrayTy->getArrayNumElements()), + ConstantInt::get(Int32Ty, NumCounters), + ConstantInt::get(Int64Ty, Inc->getHash()->getZExtValue()), + ConstantExpr::getBitCast(Name, Int8PtrTy), + ConstantExpr::getBitCast(Counters, Int64PtrTy)}; + auto *Data = new GlobalVariable(*M, DataTy, true, Name->getLinkage(), + ConstantStruct::get(DataTy, DataVals), + getVarName(Inc, "data")); + Data->setVisibility(Name->getVisibility()); + Data->setSection(getDataSection()); + Data->setAlignment(8); + Data->setComdat(Fn->getComdat()); + + // Mark the data variable as used so that it isn't stripped out. + UsedVars.push_back(Data); + + return Counters; +} + +void InstrProfiling::emitRegistration() { + // Don't do this for Darwin. compiler-rt uses linker magic. + if (Triple(M->getTargetTriple()).isOSDarwin()) + return; + + // Construct the function. + auto *VoidTy = Type::getVoidTy(M->getContext()); + auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext()); + auto *RegisterFTy = FunctionType::get(VoidTy, false); + auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage, + "__llvm_profile_register_functions", M); + RegisterF->setUnnamedAddr(true); + if (Options.NoRedZone) + RegisterF->addFnAttr(Attribute::NoRedZone); + + auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false); + auto *RuntimeRegisterF = + Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage, + "__llvm_profile_register_function", M); + + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF)); + for (Value *Data : UsedVars) + IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy)); + IRB.CreateRetVoid(); +} + +void InstrProfiling::emitRuntimeHook() { + const char *const RuntimeVarName = "__llvm_profile_runtime"; + const char *const RuntimeUserName = "__llvm_profile_runtime_user"; + + // If the module's provided its own runtime, we don't need to do anything. + if (M->getGlobalVariable(RuntimeVarName)) + return; + + // Declare an external variable that will pull in the runtime initialization. + auto *Int32Ty = Type::getInt32Ty(M->getContext()); + auto *Var = + new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, + nullptr, RuntimeVarName); + + // Make a function that uses it. 
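+  // I.e. (an illustrative C sketch of the IR built below):
+  //
+  //   extern int __llvm_profile_runtime;
+  //   int __llvm_profile_runtime_user(void) {
+  //     return __llvm_profile_runtime;
+  //   }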
+ auto *User = + Function::Create(FunctionType::get(Int32Ty, false), + GlobalValue::LinkOnceODRLinkage, RuntimeUserName, M); + User->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + User->addFnAttr(Attribute::NoRedZone); + User->setVisibility(GlobalValue::HiddenVisibility); + + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User)); + auto *Load = IRB.CreateLoad(Var); + IRB.CreateRet(Load); + + // Mark the user variable as used so that it isn't stripped out. + UsedVars.push_back(User); +} + +void InstrProfiling::emitUses() { + if (UsedVars.empty()) + return; + + GlobalVariable *LLVMUsed = M->getGlobalVariable("llvm.used"); + std::vector<Constant *> MergedVars; + if (LLVMUsed) { + // Collect the existing members of llvm.used. + ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); + for (unsigned I = 0, E = Inits->getNumOperands(); I != E; ++I) + MergedVars.push_back(Inits->getOperand(I)); + LLVMUsed->eraseFromParent(); + } + + Type *i8PTy = Type::getInt8PtrTy(M->getContext()); + // Add uses for our data. + for (auto *Value : UsedVars) + MergedVars.push_back( + ConstantExpr::getBitCast(cast<Constant>(Value), i8PTy)); + + // Recreate llvm.used. + ArrayType *ATy = ArrayType::get(i8PTy, MergedVars.size()); + LLVMUsed = + new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, MergedVars), "llvm.used"); + + LLVMUsed->setSection("llvm.metadata"); +} + +void InstrProfiling::emitInitialization() { + std::string InstrProfileOutput = Options.InstrProfileOutput; + + Constant *RegisterF = M->getFunction("__llvm_profile_register_functions"); + if (!RegisterF && InstrProfileOutput.empty()) + return; + + // Create the initialization function. + auto *VoidTy = Type::getVoidTy(M->getContext()); + auto *F = + Function::Create(FunctionType::get(VoidTy, false), + GlobalValue::InternalLinkage, "__llvm_profile_init", M); + F->setUnnamedAddr(true); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + // Add the basic block and the necessary calls. + IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F)); + if (RegisterF) + IRB.CreateCall(RegisterF, {}); + if (!InstrProfileOutput.empty()) { + auto *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); + auto *SetNameTy = FunctionType::get(VoidTy, Int8PtrTy, false); + auto *SetNameF = + Function::Create(SetNameTy, GlobalValue::ExternalLinkage, + "__llvm_profile_override_default_filename", M); + + // Create variable for profile name. + Constant *ProfileNameConst = + ConstantDataArray::getString(M->getContext(), InstrProfileOutput, true); + GlobalVariable *ProfileName = + new GlobalVariable(*M, ProfileNameConst->getType(), true, + GlobalValue::PrivateLinkage, ProfileNameConst); + + IRB.CreateCall(SetNameF, IRB.CreatePointerCast(ProfileName, Int8PtrTy)); + } + IRB.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp new file mode 100644 index 0000000..2750585 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -0,0 +1,40 @@ +//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// Instrumentation library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeInstrumentation - Initialize all passes in the Instrumentation
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+  initializeAddressSanitizerPass(Registry);
+  initializeAddressSanitizerModulePass(Registry);
+  initializeBoundsCheckingPass(Registry);
+  initializeGCOVProfilerPass(Registry);
+  initializeInstrProfilingPass(Registry);
+  initializeMemorySanitizerPass(Registry);
+  initializeThreadSanitizerPass(Registry);
+  initializeSanitizerCoverageModulePass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
+  initializeSafeStackPass(Registry);
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+  initializeInstrumentation(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
new file mode 100644
index 0000000..363539b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -0,0 +1,111 @@
+//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This module provides means for calculating a maximum spanning tree for a
+// given set of weighted edges. The type parameter T is the type of a node.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
+#define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
+
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/IR/BasicBlock.h"
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+
+  /// MaximumSpanningTree - An MST implementation.
+  /// The type parameter T determines the type of the nodes of the graph.
+  template <typename T>
+  class MaximumSpanningTree {
+  public:
+    typedef std::pair<const T*, const T*> Edge;
+    typedef std::pair<Edge, double> EdgeWeight;
+    typedef std::vector<EdgeWeight> EdgeWeights;
+  protected:
+    typedef std::vector<Edge> MaxSpanTree;
+
+    MaxSpanTree MST;
+
+  private:
+    // A comparator for weighted edges.
+    struct EdgeWeightCompare {
+      // Returning size_t (not bool) keeps the block-size tie-breaking
+      // below meaningful.
+      static size_t getBlockSize(const T *X) {
+        const BasicBlock *BB = dyn_cast_or_null<BasicBlock>(X);
+        return BB ? BB->size() : 0;
+      }
+
+      bool operator()(EdgeWeight X, EdgeWeight Y) const {
+        if (X.second > Y.second) return true;
+        if (X.second < Y.second) return false;
+
+        // Equal edge weights: break ties by comparing block sizes.
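+        // (Together with the std::stable_sort in the constructor this makes
+        // the edge order, and therefore the resulting MST, deterministic.)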
+        size_t XSizeA = getBlockSize(X.first.first);
+        size_t YSizeA = getBlockSize(Y.first.first);
+        if (XSizeA > YSizeA) return true;
+        if (XSizeA < YSizeA) return false;
+
+        size_t XSizeB = getBlockSize(X.first.second);
+        size_t YSizeB = getBlockSize(Y.first.second);
+        if (XSizeB > YSizeB) return true;
+        if (XSizeB < YSizeB) return false;
+
+        return false;
+      }
+    };
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+
+    /// MaximumSpanningTree() - Takes a vector of weighted edges and returns a
+    /// spanning tree.
+    MaximumSpanningTree(EdgeWeights &EdgeVector) {
+
+      std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare());
+
+      // Create the spanning tree. Forest is an equivalence-class structure
+      // that makes checking whether two nodes already share a common
+      // (sub-)tree fast and cheap.
+      EquivalenceClasses<const T*> Forest;
+      for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+           EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+        Edge e = (*EWi).first;
+
+        Forest.insert(e.first);
+        Forest.insert(e.second);
+      }
+
+      // Iterate over the sorted edges, biggest first.
+      for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+           EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+        Edge e = (*EWi).first;
+
+        if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) {
+          Forest.unionSets(e.first, e.second);
+          // The leaders differ, so the edge connects two distinct subtrees;
+          // push it to the MST.
+          MST.push_back(e);
+        }
+      }
+    }
+
+    typename MaxSpanTree::iterator begin() {
+      return MST.begin();
+    }
+
+    typename MaxSpanTree::iterator end() {
+      return MST.end();
+    }
+  };
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
new file mode 100644
index 0000000..286a563
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -0,0 +1,3026 @@
+//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needlessly overwriting the origin of the 4-byte region
+/// on a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, an atomic
+/// store to two disjoint locations cannot be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, the shadow store and load are
+/// correctly ordered such that the load will get either the value that was
+/// stored, or some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. The current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store of a clean
+/// shadow.
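+///
+/// For example, for an atomic release store the instrumented sequence is,
+/// in order (a C-like pseudocode sketch of the ordering described above):
+///
+///   *shadow(addr) = 0;       // always-clean shadow, stored first
+///   atomic_store(addr, v);   // the application store
+///
+/// and for an atomic load, the shadow load is emitted after the app load.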
+
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msan"
+
+static const unsigned kOriginSize = 4;
+static const unsigned kMinOriginAlignment = 4;
+static const unsigned kShadowTLSAlignment = 8;
+
+// These constants must be kept in sync with the ones in msan.h.
+static const unsigned kParamTLSSize = 800;
+static const unsigned kRetvalTLSSize = 800;
+
+// Access sizes are powers of two: 1, 2, 4, 8.
+static const size_t kNumberOfAccessSizes = 4;
+
+/// \brief Track origins of uninitialized values.
+///
+/// Adds a section to the MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<int> ClTrackOrigins("msan-track-origins",
+       cl::desc("Track origins (allocation sites) of poisoned memory"),
+       cl::Hidden, cl::init(0));
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+       cl::desc("keep going after reporting a UMR"),
+       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+       cl::desc("poison uninitialized stack variables"),
+       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+       cl::desc("poison uninitialized stack variables with a call"),
+       cl::Hidden, cl::init(false));
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+       cl::desc("poison uninitialized stack variables with the given pattern"),
+       cl::Hidden, cl::init(0xff));
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+       cl::desc("poison undef temps"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+       cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+       cl::desc("exact handling of relational integer ICmp"),
+       cl::Hidden, cl::init(false));
+
+// This flag controls whether we check the shadow of the address operand of a
+// load or store. Such bugs are very rare, since a load from a garbage address
+// typically results in SEGV, but they still happen (e.g. only the lower bits
+// of the address are garbage, or the access happens early at program startup
+// where malloc-ed memory is more likely to be zeroed). As of 2012-08-28 this
+// flag adds a 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address", + cl::desc("report accesses through a pointer which has poisoned shadow"), + cl::Hidden, cl::init(true)); + +static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions", + cl::desc("print out instructions with default strict semantics"), + cl::Hidden, cl::init(false)); + +static cl::opt<int> ClInstrumentationWithCallThreshold( + "msan-instrumentation-with-call-threshold", + cl::desc( + "If the function being instrumented requires more than " + "this number of checks and origin stores, use callbacks instead of " + "inline checks (-1 means never use callbacks)."), + cl::Hidden, cl::init(3500)); + +// This is an experiment to enable handling of cases where shadow is a non-zero +// compile-time constant. For some unexplainable reason they were silently +// ignored in the instrumentation. +static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow", + cl::desc("Insert checks for constant shadow values"), + cl::Hidden, cl::init(false)); + +static const char *const kMsanModuleCtorName = "msan.module_ctor"; +static const char *const kMsanInitName = "__msan_init"; + +namespace { + +// Memory map parameters used in application-to-shadow address calculation. +// Offset = (Addr & ~AndMask) ^ XorMask +// Shadow = ShadowBase + Offset +// Origin = OriginBase + Offset +struct MemoryMapParams { + uint64_t AndMask; + uint64_t XorMask; + uint64_t ShadowBase; + uint64_t OriginBase; +}; + +struct PlatformMemoryMapParams { + const MemoryMapParams *bits32; + const MemoryMapParams *bits64; +}; + +// i386 Linux +static const MemoryMapParams Linux_I386_MemoryMapParams = { + 0x000080000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x000040000000, // OriginBase +}; + +// x86_64 Linux +static const MemoryMapParams Linux_X86_64_MemoryMapParams = { + 0x400000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x200000000000, // OriginBase +}; + +// mips64 Linux +static const MemoryMapParams Linux_MIPS64_MemoryMapParams = { + 0x004000000000, // AndMask + 0, // XorMask (not used) + 0, // ShadowBase (not used) + 0x002000000000, // OriginBase +}; + +// ppc64 Linux +static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = { + 0x200000000000, // AndMask + 0x100000000000, // XorMask + 0x080000000000, // ShadowBase + 0x1C0000000000, // OriginBase +}; + +// i386 FreeBSD +static const MemoryMapParams FreeBSD_I386_MemoryMapParams = { + 0x000180000000, // AndMask + 0x000040000000, // XorMask + 0x000020000000, // ShadowBase + 0x000700000000, // OriginBase +}; + +// x86_64 FreeBSD +static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = { + 0xc00000000000, // AndMask + 0x200000000000, // XorMask + 0x100000000000, // ShadowBase + 0x380000000000, // OriginBase +}; + +static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = { + &Linux_I386_MemoryMapParams, + &Linux_X86_64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = { + NULL, + &Linux_MIPS64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = { + NULL, + &Linux_PowerPC64_MemoryMapParams, +}; + +static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = { + &FreeBSD_I386_MemoryMapParams, + &FreeBSD_X86_64_MemoryMapParams, +}; + +/// \brief An instrumentation pass implementing detection of uninitialized +/// reads. +/// +/// MemorySanitizer: instrument the code in module to find +/// uninitialized reads. 
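+///
+/// For example, with the x86_64 Linux parameters above (AndMask
+/// 0x400000000000, XorMask and ShadowBase both zero), an application address
+/// such as 0x7f0000001000 maps to the shadow address
+/// 0x7f0000001000 & ~0x400000000000 = 0x3f0000001000, and its origin slot is
+/// the same offset plus OriginBase (0x200000000000).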
+class MemorySanitizer : public FunctionPass { + public: + MemorySanitizer(int TrackOrigins = 0) + : FunctionPass(ID), + TrackOrigins(std::max(TrackOrigins, (int)ClTrackOrigins)), + WarningFn(nullptr) {} + const char *getPassName() const override { return "MemorySanitizer"; } + bool runOnFunction(Function &F) override; + bool doInitialization(Module &M) override; + static char ID; // Pass identification, replacement for typeid. + + private: + void initializeCallbacks(Module &M); + + /// \brief Track origins (allocation points) of uninitialized values. + int TrackOrigins; + + LLVMContext *C; + Type *IntptrTy; + Type *OriginTy; + /// \brief Thread-local shadow storage for function parameters. + GlobalVariable *ParamTLS; + /// \brief Thread-local origin storage for function parameters. + GlobalVariable *ParamOriginTLS; + /// \brief Thread-local shadow storage for function return value. + GlobalVariable *RetvalTLS; + /// \brief Thread-local origin storage for function return value. + GlobalVariable *RetvalOriginTLS; + /// \brief Thread-local shadow storage for in-register va_arg function + /// parameters (x86_64-specific). + GlobalVariable *VAArgTLS; + /// \brief Thread-local shadow storage for va_arg overflow area + /// (x86_64-specific). + GlobalVariable *VAArgOverflowSizeTLS; + /// \brief Thread-local space used to pass origin value to the UMR reporting + /// function. + GlobalVariable *OriginTLS; + + /// \brief The run-time callback to print a warning. + Value *WarningFn; + // These arrays are indexed by log2(AccessSize). + Value *MaybeWarningFn[kNumberOfAccessSizes]; + Value *MaybeStoreOriginFn[kNumberOfAccessSizes]; + + /// \brief Run-time helper that generates a new origin value for a stack + /// allocation. + Value *MsanSetAllocaOrigin4Fn; + /// \brief Run-time helper that poisons stack on function entry. + Value *MsanPoisonStackFn; + /// \brief Run-time helper that records a store (or any event) of an + /// uninitialized value and returns an updated origin id encoding this info. + Value *MsanChainOriginFn; + /// \brief MSan runtime replacements for memmove, memcpy and memset. + Value *MemmoveFn, *MemcpyFn, *MemsetFn; + + /// \brief Memory map parameters used in application-to-shadow calculation. + const MemoryMapParams *MapParams; + + MDNode *ColdCallWeights; + /// \brief Branch weights for origin store. + MDNode *OriginStoreWeights; + /// \brief An empty volatile inline asm that prevents callback merge. + InlineAsm *EmptyAsm; + Function *MsanCtorFunction; + + friend struct MemorySanitizerVisitor; + friend struct VarArgAMD64Helper; + friend struct VarArgMIPS64Helper; +}; +} // namespace + +char MemorySanitizer::ID = 0; +INITIALIZE_PASS(MemorySanitizer, "msan", + "MemorySanitizer: detects uninitialized reads.", + false, false) + +FunctionPass *llvm::createMemorySanitizerPass(int TrackOrigins) { + return new MemorySanitizer(TrackOrigins); +} + +/// \brief Create a non-const global initialized with the given string. +/// +/// Creates a writable global for Str so that we can pass it to the +/// run-time lib. Runtime uses first 4 bytes of the string to store the +/// frame ID, so the string needs to be mutable. +static GlobalVariable *createPrivateNonConstGlobalForString(Module &M, + StringRef Str) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false, + GlobalValue::PrivateLinkage, StrConst, ""); +} + + +/// \brief Insert extern declaration of runtime-provided functions and globals. 
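+///
+/// This creates the __msan_warning* callbacks, the per-access-size
+/// __msan_maybe_* helpers, the __msan_mem{move,cpy,set} interceptors, and
+/// the thread-local globals used to pass shadow and origins across calls.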
+void MemorySanitizer::initializeCallbacks(Module &M) { + // Only do this once. + if (WarningFn) + return; + + IRBuilder<> IRB(*C); + // Create the callback. + // FIXME: this function should have "Cold" calling conv, + // which is not yet implemented. + StringRef WarningFnName = ClKeepGoing ? "__msan_warning" + : "__msan_warning_noreturn"; + WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr); + + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + unsigned AccessSize = 1 << AccessSizeIndex; + std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize); + MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt32Ty(), nullptr); + + FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize); + MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), + IRB.getInt8PtrTy(), IRB.getInt32Ty(), nullptr); + } + + MsanSetAllocaOrigin4Fn = M.getOrInsertFunction( + "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, + IRB.getInt8PtrTy(), IntptrTy, nullptr); + MsanPoisonStackFn = + M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IntptrTy, nullptr); + MsanChainOriginFn = M.getOrInsertFunction( + "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), nullptr); + MemmoveFn = M.getOrInsertFunction( + "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, nullptr); + MemcpyFn = M.getOrInsertFunction( + "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IntptrTy, nullptr); + MemsetFn = M.getOrInsertFunction( + "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(), + IntptrTy, nullptr); + + // Create globals. + RetvalTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8), false, + GlobalVariable::ExternalLinkage, nullptr, "__msan_retval_tls", nullptr, + GlobalVariable::InitialExecTLSModel); + RetvalOriginTLS = new GlobalVariable( + M, OriginTy, false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_retval_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel); + + ParamTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false, + GlobalVariable::ExternalLinkage, nullptr, "__msan_param_tls", nullptr, + GlobalVariable::InitialExecTLSModel); + ParamOriginTLS = new GlobalVariable( + M, ArrayType::get(OriginTy, kParamTLSSize / 4), false, + GlobalVariable::ExternalLinkage, nullptr, "__msan_param_origin_tls", + nullptr, GlobalVariable::InitialExecTLSModel); + + VAArgTLS = new GlobalVariable( + M, ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), false, + GlobalVariable::ExternalLinkage, nullptr, "__msan_va_arg_tls", nullptr, + GlobalVariable::InitialExecTLSModel); + VAArgOverflowSizeTLS = new GlobalVariable( + M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_va_arg_overflow_size_tls", nullptr, + GlobalVariable::InitialExecTLSModel); + OriginTLS = new GlobalVariable( + M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, nullptr, + "__msan_origin_tls", nullptr, GlobalVariable::InitialExecTLSModel); + + // We insert an empty inline asm after __msan_report* to avoid callback merge. 
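+  // (Without it, the backend may merge two identical warning call sites
+  // into one, and the reports would lose their distinct PCs.)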
+ EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); +} + +/// \brief Module-level initialization. +/// +/// inserts a call to __msan_init to the module's constructor list. +bool MemorySanitizer::doInitialization(Module &M) { + auto &DL = M.getDataLayout(); + + Triple TargetTriple(M.getTargetTriple()); + switch (TargetTriple.getOS()) { + case Triple::FreeBSD: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = FreeBSD_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = FreeBSD_X86_MemoryMapParams.bits32; + break; + default: + report_fatal_error("unsupported architecture"); + } + break; + case Triple::Linux: + switch (TargetTriple.getArch()) { + case Triple::x86_64: + MapParams = Linux_X86_MemoryMapParams.bits64; + break; + case Triple::x86: + MapParams = Linux_X86_MemoryMapParams.bits32; + break; + case Triple::mips64: + case Triple::mips64el: + MapParams = Linux_MIPS_MemoryMapParams.bits64; + break; + case Triple::ppc64: + case Triple::ppc64le: + MapParams = Linux_PowerPC_MemoryMapParams.bits64; + break; + default: + report_fatal_error("unsupported architecture"); + } + break; + default: + report_fatal_error("unsupported operating system"); + } + + C = &(M.getContext()); + IRBuilder<> IRB(*C); + IntptrTy = IRB.getIntPtrTy(DL); + OriginTy = IRB.getInt32Ty(); + + ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000); + OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); + + std::tie(MsanCtorFunction, std::ignore) = + createSanitizerCtorAndInitFunctions(M, kMsanModuleCtorName, kMsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}); + + appendToGlobalCtors(M, MsanCtorFunction, 0); + + if (TrackOrigins) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(TrackOrigins), "__msan_track_origins"); + + if (ClKeepGoing) + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + + return true; +} + +namespace { + +/// \brief A helper class that handles instrumentation of VarArg +/// functions on a particular platform. +/// +/// Implementations are expected to insert the instrumentation +/// necessary to propagate argument shadow through VarArg function +/// calls. Visit* methods are called during an InstVisitor pass over +/// the function, and should avoid creating new basic blocks. A new +/// instance of this class is created for each instrumented function. +struct VarArgHelper { + /// \brief Visit a CallSite. + virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0; + + /// \brief Visit a va_start call. + virtual void visitVAStartInst(VAStartInst &I) = 0; + + /// \brief Visit a va_copy call. + virtual void visitVACopyInst(VACopyInst &I) = 0; + + /// \brief Finalize function instrumentation. + /// + /// This method is called after visiting all interesting (see above) + /// instructions in a function. + virtual void finalizeInstrumentation() = 0; + + virtual ~VarArgHelper() {} +}; + +struct MemorySanitizerVisitor; + +VarArgHelper* +CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, + MemorySanitizerVisitor &Visitor); + +unsigned TypeSizeToSizeIndex(unsigned TypeSize) { + if (TypeSize <= 8) return 0; + return Log2_32_Ceil(TypeSize / 8); +} + +/// This class does all the work for a given function. Store and Load +/// instructions store and load corresponding shadow and origin +/// values. 
Most instructions propagate shadow from arguments to their +/// return values. Certain instructions (most importantly, BranchInst) +/// test their argument shadow and print reports (with a runtime call) if it's +/// non-zero. +struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { + Function &F; + MemorySanitizer &MS; + SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes; + ValueMap<Value*, Value*> ShadowMap, OriginMap; + std::unique_ptr<VarArgHelper> VAHelper; + + // The following flags disable parts of MSan instrumentation based on + // blacklist contents and command-line options. + bool InsertChecks; + bool PropagateShadow; + bool PoisonStack; + bool PoisonUndef; + bool CheckReturnValue; + + struct ShadowOriginAndInsertPoint { + Value *Shadow; + Value *Origin; + Instruction *OrigIns; + ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I) + : Shadow(S), Origin(O), OrigIns(I) { } + }; + SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList; + SmallVector<Instruction*, 16> StoreList; + + MemorySanitizerVisitor(Function &F, MemorySanitizer &MS) + : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) { + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); + InsertChecks = SanitizeFunction; + PropagateShadow = SanitizeFunction; + PoisonStack = SanitizeFunction && ClPoisonStack; + PoisonUndef = SanitizeFunction && ClPoisonUndef; + // FIXME: Consider using SpecialCaseList to specify a list of functions that + // must always return fully initialized values. For now, we hardcode "main". + CheckReturnValue = SanitizeFunction && (F.getName() == "main"); + + DEBUG(if (!InsertChecks) + dbgs() << "MemorySanitizer is not inserting checks into '" + << F.getName() << "'\n"); + } + + Value *updateOrigin(Value *V, IRBuilder<> &IRB) { + if (MS.TrackOrigins <= 1) return V; + return IRB.CreateCall(MS.MsanChainOriginFn, V); + } + + Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + if (IntptrSize == kOriginSize) return Origin; + assert(IntptrSize == kOriginSize * 2); + Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false); + return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8)); + } + + /// \brief Fill memory range with the given origin value. + void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, + unsigned Size, unsigned Alignment) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); + assert(IntptrAlignment >= kMinOriginAlignment); + assert(IntptrSize >= kOriginSize); + + unsigned Ofs = 0; + unsigned CurrentAlignment = Alignment; + if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { + Value *IntptrOrigin = originToIntptr(IRB, Origin); + Value *IntptrOriginPtr = + IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0)); + for (unsigned i = 0; i < Size / IntptrSize; ++i) { + Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) + : IntptrOriginPtr; + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + Ofs += IntptrSize / kOriginSize; + CurrentAlignment = IntptrAlignment; + } + } + + for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { + Value *GEP = + i ? 
IRB.CreateConstGEP1_32(nullptr, OriginPtr, i) : OriginPtr; + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + CurrentAlignment = kMinOriginAlignment; + } + } + + void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, + unsigned Alignment, bool AsCall) { + const DataLayout &DL = F.getParent()->getDataLayout(); + unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); + if (isa<StructType>(Shadow->getType())) { + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); + } else { + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) + paintOrigin(IRB, updateOrigin(Origin, IRB), + getOriginPtr(Addr, IRB, Alignment), StoreSize, + OriginAlignment); + return; + } + + unsigned TypeSizeInBits = + DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeStoreOriginFn[SizeIndex]; + Value *ConvertedShadow2 = IRB.CreateZExt( + ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, {ConvertedShadow2, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + Origin}); + } else { + Value *Cmp = IRB.CreateICmpNE( + ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, IRB.GetInsertPoint(), false, MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), + getOriginPtr(Addr, IRBNew, Alignment), StoreSize, + OriginAlignment); + } + } + } + + void materializeStores(bool InstrumentWithCalls) { + for (auto Inst : StoreList) { + StoreInst &SI = *dyn_cast<StoreInst>(Inst); + + IRBuilder<> IRB(&SI); + Value *Val = SI.getValueOperand(); + Value *Addr = SI.getPointerOperand(); + Value *Shadow = SI.isAtomic() ? getCleanShadow(Val) : getShadow(Val); + Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); + + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, SI.getAlignment()); + DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); + (void)NewSI; + + if (ClCheckAccessAddress) insertShadowCheck(Addr, &SI); + + if (SI.isAtomic()) SI.setOrdering(addReleaseOrdering(SI.getOrdering())); + + if (MS.TrackOrigins && !SI.isAtomic()) + storeOrigin(IRB, Addr, Shadow, getOrigin(Val), SI.getAlignment(), + InstrumentWithCalls); + } + } + + void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin, + bool AsCall) { + IRBuilder<> IRB(OrigIns); + DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n"); + Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); + DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); + + Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); + if (ConstantShadow) { + if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { + if (MS.TrackOrigins) { + IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), + MS.OriginTLS); + } + IRB.CreateCall(MS.WarningFn, {}); + IRB.CreateCall(MS.EmptyAsm, {}); + // FIXME: Insert UnreachableInst if !ClKeepGoing? + // This may invalidate some of the following checks and needs to be done + // at the very end. 
+ } + return; + } + + const DataLayout &DL = OrigIns->getModule()->getDataLayout(); + + unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType()); + unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits); + if (AsCall && SizeIndex < kNumberOfAccessSizes) { + Value *Fn = MS.MaybeWarningFn[SizeIndex]; + Value *ConvertedShadow2 = + IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex))); + IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin + ? Origin + : (Value *)IRB.getInt32(0)}); + } else { + Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, + getCleanShadow(ConvertedShadow), "_mscmp"); + Instruction *CheckTerm = SplitBlockAndInsertIfThen( + Cmp, OrigIns, + /* Unreachable */ !ClKeepGoing, MS.ColdCallWeights); + + IRB.SetInsertPoint(CheckTerm); + if (MS.TrackOrigins) { + IRB.CreateStore(Origin ? (Value *)Origin : (Value *)IRB.getInt32(0), + MS.OriginTLS); + } + IRB.CreateCall(MS.WarningFn, {}); + IRB.CreateCall(MS.EmptyAsm, {}); + DEBUG(dbgs() << " CHECK: " << *Cmp << "\n"); + } + } + + void materializeChecks(bool InstrumentWithCalls) { + for (const auto &ShadowData : InstrumentationList) { + Instruction *OrigIns = ShadowData.OrigIns; + Value *Shadow = ShadowData.Shadow; + Value *Origin = ShadowData.Origin; + materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls); + } + DEBUG(dbgs() << "DONE:\n" << F); + } + + /// \brief Add MemorySanitizer instrumentation to a function. + bool runOnFunction() { + MS.initializeCallbacks(*F.getParent()); + + // In the presence of unreachable blocks, we may see Phi nodes with + // incoming nodes from such blocks. Since InstVisitor skips unreachable + // blocks, such nodes will not have any shadow value associated with them. + // It's easier to remove unreachable blocks than deal with missing shadow. + removeUnreachableBlocks(F); + + // Iterate all BBs in depth-first order and create shadow instructions + // for all instructions (where applicable). + // For PHI nodes we create dummy shadow PHIs which will be finalized later. + for (BasicBlock *BB : depth_first(&F.getEntryBlock())) + visit(*BB); + + + // Finalize PHI nodes. + for (PHINode *PN : ShadowPHINodes) { + PHINode *PNS = cast<PHINode>(getShadow(PN)); + PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr; + size_t NumValues = PN->getNumIncomingValues(); + for (size_t v = 0; v < NumValues; v++) { + PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v)); + if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v)); + } + } + + VAHelper->finalizeInstrumentation(); + + bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 && + InstrumentationList.size() + StoreList.size() > + (unsigned)ClInstrumentationWithCallThreshold; + + // Delayed instrumentation of StoreInst. + // This may add new checks to be inserted later. + materializeStores(InstrumentWithCalls); + + // Insert shadow value checks. + materializeChecks(InstrumentWithCalls); + + return true; + } + + /// \brief Compute the shadow type that corresponds to a given Value. + Type *getShadowTy(Value *V) { + return getShadowTy(V->getType()); + } + + /// \brief Compute the shadow type that corresponds to a given Type. + Type *getShadowTy(Type *OrigTy) { + if (!OrigTy->isSized()) { + return nullptr; + } + // For integer type, shadow is the same as the original type. + // This may return weird-sized types like i1. 
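+    // Other cases, handled below: the shadow of <4 x float> is <4 x i32>,
+    // the shadow of a struct is a struct of its members' shadows, and
+    // anything else (e.g. x86_fp80) falls through to an iN of equal size.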
+    if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+      return IT;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+      uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
+      return VectorType::get(IntegerType::get(*MS.C, EltSize),
+                             VT->getNumElements());
+    }
+    if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+      return ArrayType::get(getShadowTy(AT->getElementType()),
+                            AT->getNumElements());
+    }
+    if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+      SmallVector<Type*, 4> Elements;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Elements.push_back(getShadowTy(ST->getElementType(i)));
+      StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+      DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+      return Res;
+    }
+    uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
+    return IntegerType::get(*MS.C, TypeSize);
+  }
+
+  /// \brief Flatten a vector type.
+  Type *getShadowTyNoVec(Type *ty) {
+    if (VectorType *vt = dyn_cast<VectorType>(ty))
+      return IntegerType::get(*MS.C, vt->getBitWidth());
+    return ty;
+  }
+
+  /// \brief Convert a shadow value to its flattened variant.
+  Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+    Type *Ty = V->getType();
+    Type *NoVecTy = getShadowTyNoVec(Ty);
+    if (Ty == NoVecTy) return V;
+    return IRB.CreateBitCast(V, NoVecTy);
+  }
+
+  /// \brief Compute the integer shadow offset that corresponds to a given
+  /// application address.
+  ///
+  /// Offset = (Addr & ~AndMask) ^ XorMask
+  Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+    uint64_t AndMask = MS.MapParams->AndMask;
+    assert(AndMask != 0 && "AndMask shall be specified");
+    Value *OffsetLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+    uint64_t XorMask = MS.MapParams->XorMask;
+    if (XorMask != 0)
+      OffsetLong = IRB.CreateXor(OffsetLong,
+                                 ConstantInt::get(MS.IntptrTy, XorMask));
+    return OffsetLong;
+  }
+
+  /// \brief Compute the shadow address that corresponds to a given application
+  /// address.
+  ///
+  /// Shadow = ShadowBase + Offset
+  Value *getShadowPtr(Value *Addr, Type *ShadowTy,
+                      IRBuilder<> &IRB) {
+    Value *ShadowLong = getShadowPtrOffset(Addr, IRB);
+    uint64_t ShadowBase = MS.MapParams->ShadowBase;
+    if (ShadowBase != 0)
+      ShadowLong =
+        IRB.CreateAdd(ShadowLong,
+                      ConstantInt::get(MS.IntptrTy, ShadowBase));
+    return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+  }
+
+  /// \brief Compute the origin address that corresponds to a given application
+  /// address.
+  ///
+  /// OriginAddr = (OriginBase + Offset) & ~3ULL
+  Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB, unsigned Alignment) {
+    Value *OriginLong = getShadowPtrOffset(Addr, IRB);
+    uint64_t OriginBase = MS.MapParams->OriginBase;
+    if (OriginBase != 0)
+      OriginLong =
+        IRB.CreateAdd(OriginLong,
+                      ConstantInt::get(MS.IntptrTy, OriginBase));
+    if (Alignment < kMinOriginAlignment) {
+      uint64_t Mask = kMinOriginAlignment - 1;
+      OriginLong = IRB.CreateAnd(OriginLong,
+                                 ConstantInt::get(MS.IntptrTy, ~Mask));
+    }
+    return IRB.CreateIntToPtr(OriginLong,
+                              PointerType::get(IRB.getInt32Ty(), 0));
+  }
+
+  /// \brief Compute the shadow address for a given function argument.
+  ///
+  /// Shadow = ParamTLS+ArgOffset.
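+  ///
+  /// ArgOffset is a byte offset into the fixed-size __msan_param_tls array
+  /// (kParamTLSSize bytes); arguments whose shadow does not fit there are
+  /// given clean shadow instead (see the Overflow handling in getShadow).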
+  Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msarg");
+  }
+
+  /// \brief Compute the origin address for a given function argument.
+  Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    if (!MS.TrackOrigins) return nullptr;
+    Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+                              "_msarg_o");
+  }
+
+  /// \brief Compute the shadow address for a retval.
+  Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+    Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy);
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msret");
+  }
+
+  /// \brief Compute the origin address for a retval.
+  Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+    // We keep a single origin for the entire retval. Might be too optimistic.
+    return MS.RetvalOriginTLS;
+  }
+
+  /// \brief Set SV to be the shadow value for V.
+  void setShadow(Value *V, Value *SV) {
+    assert(!ShadowMap.count(V) && "Values may only have one shadow");
+    ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
+  }
+
+  /// \brief Set Origin to be the origin value for V.
+  void setOrigin(Value *V, Value *Origin) {
+    if (!MS.TrackOrigins) return;
+    assert(!OriginMap.count(V) && "Values may only have one origin");
+    DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+    OriginMap[V] = Origin;
+  }
+
+  /// \brief Create a clean shadow value for a given value.
+  ///
+  /// Clean shadow (all zeroes) means all bits of the value are defined
+  /// (initialized).
+  Constant *getCleanShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return nullptr;
+    return Constant::getNullValue(ShadowTy);
+  }
+
+  /// \brief Create a dirty shadow of a given shadow type.
+  Constant *getPoisonedShadow(Type *ShadowTy) {
+    assert(ShadowTy);
+    if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+      return Constant::getAllOnesValue(ShadowTy);
+    if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+                                      getPoisonedShadow(AT->getElementType()));
+      return ConstantArray::get(AT, Vals);
+    }
+    if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+      SmallVector<Constant *, 4> Vals;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+      return ConstantStruct::get(ST, Vals);
+    }
+    llvm_unreachable("Unexpected shadow type");
+  }
+
+  /// \brief Create a dirty shadow for a given value.
+  Constant *getPoisonedShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return nullptr;
+    return getPoisonedShadow(ShadowTy);
+  }
+
+  /// \brief Create a clean (zero) origin.
+  Value *getCleanOrigin() {
+    return Constant::getNullValue(MS.OriginTy);
+  }
+
+  /// \brief Get the shadow value for a given Value.
+  ///
+  /// This function either returns the value set earlier with setShadow,
+  /// or extracts it from ParamTLS (for function arguments).
+  Value *getShadow(Value *V) {
+    if (!PropagateShadow) return getCleanShadow(V);
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      // For instructions the shadow is already stored in the map.
+ Value *Shadow = ShadowMap[V]; + if (!Shadow) { + DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent())); + (void)I; + assert(Shadow && "No shadow for a value"); + } + return Shadow; + } + if (UndefValue *U = dyn_cast<UndefValue>(V)) { + Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); + DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); + (void)U; + return AllOnes; + } + if (Argument *A = dyn_cast<Argument>(V)) { + // For arguments we compute the shadow on demand and store it in the map. + Value **ShadowPtr = &ShadowMap[V]; + if (*ShadowPtr) + return *ShadowPtr; + Function *F = A->getParent(); + IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI()); + unsigned ArgOffset = 0; + const DataLayout &DL = F->getParent()->getDataLayout(); + for (auto &FArg : F->args()) { + if (!FArg.getType()->isSized()) { + DEBUG(dbgs() << "Arg is not sized\n"); + continue; + } + unsigned Size = + FArg.hasByValAttr() + ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType()) + : DL.getTypeAllocSize(FArg.getType()); + if (A == &FArg) { + bool Overflow = ArgOffset + Size > kParamTLSSize; + Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset); + if (FArg.hasByValAttr()) { + // ByVal pointer itself has clean shadow. We copy the actual + // argument shadow to the underlying memory. + // Figure out maximal valid memcpy alignment. + unsigned ArgAlign = FArg.getParamAlignment(); + if (ArgAlign == 0) { + Type *EltType = A->getType()->getPointerElementType(); + ArgAlign = DL.getABITypeAlignment(EltType); + } + if (Overflow) { + // ParamTLS overflow. + EntryIRB.CreateMemSet( + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), + Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); + } else { + unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); + Value *Cpy = EntryIRB.CreateMemCpy( + getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size, + CopyAlign); + DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); + (void)Cpy; + } + *ShadowPtr = getCleanShadow(V); + } else { + if (Overflow) { + // ParamTLS overflow. + *ShadowPtr = getCleanShadow(V); + } else { + *ShadowPtr = + EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment); + } + } + DEBUG(dbgs() << " ARG: " << FArg << " ==> " << + **ShadowPtr << "\n"); + if (MS.TrackOrigins && !Overflow) { + Value *OriginPtr = + getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset); + setOrigin(A, EntryIRB.CreateLoad(OriginPtr)); + } else { + setOrigin(A, getCleanOrigin()); + } + } + ArgOffset += RoundUpToAlignment(Size, kShadowTLSAlignment); + } + assert(*ShadowPtr && "Could not find shadow for an argument"); + return *ShadowPtr; + } + // For everything else the shadow is zero. + return getCleanShadow(V); + } + + /// \brief Get the shadow for i-th argument of the instruction I. + Value *getShadow(Instruction *I, int i) { + return getShadow(I->getOperand(i)); + } + + /// \brief Get the origin for a value. + Value *getOrigin(Value *V) { + if (!MS.TrackOrigins) return nullptr; + if (!PropagateShadow) return getCleanOrigin(); + if (isa<Constant>(V)) return getCleanOrigin(); + assert((isa<Instruction>(V) || isa<Argument>(V)) && + "Unexpected value type in getOrigin()"); + Value *Origin = OriginMap[V]; + assert(Origin && "Missing origin"); + return Origin; + } + + /// \brief Get the origin for i-th argument of the instruction I. + Value *getOrigin(Instruction *I, int i) { + return getOrigin(I->getOperand(i)); + } + + /// \brief Remember the place where a shadow check should be inserted. 
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning at runtime if the shadow value is not 0.
+  void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+    assert(Shadow);
+    if (!InsertChecks) return;
+#ifndef NDEBUG
+    Type *ShadowTy = Shadow->getType();
+    assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
+           "Can only insert checks for integer and vector shadow types");
+#endif
+    InstrumentationList.push_back(
+        ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+  }
+
+  /// \brief Remember the place where a shadow check should be inserted.
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning at runtime if the value is not fully defined.
+  void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+    assert(Val);
+    Value *Shadow, *Origin;
+    if (ClCheckConstantShadow) {
+      Shadow = getShadow(Val);
+      if (!Shadow) return;
+      Origin = getOrigin(Val);
+    } else {
+      Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+      if (!Shadow) return;
+      Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    }
+    insertShadowCheck(Shadow, Origin, OrigIns);
+  }
+
+  AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Release:
+        return Release;
+      case Acquire:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
+  }
+
+  AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Acquire:
+        return Acquire;
+      case Release:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
+  }
+
+  // ------------------- Visitors.
+
+  /// \brief Instrument LoadInst
+  ///
+  /// Loads the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the load address is fully defined.
+  void visitLoadInst(LoadInst &I) {
+    assert(I.getType()->isSized() && "Load type must have size");
+    IRBuilder<> IRB(I.getNextNode());
+    Type *ShadowTy = getShadowTy(&I);
+    Value *Addr = I.getPointerOperand();
+    if (PropagateShadow && !I.getMetadata("nosanitize")) {
+      Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+      setShadow(&I,
+                IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(I.getPointerOperand(), &I);
+
+    if (I.isAtomic())
+      I.setOrdering(addAcquireOrdering(I.getOrdering()));
+
+    if (MS.TrackOrigins) {
+      if (PropagateShadow) {
+        unsigned Alignment = I.getAlignment();
+        unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+        setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB, Alignment),
+                                            OriginAlignment));
+      } else {
+        setOrigin(&I, getCleanOrigin());
+      }
+    }
+  }
+
+  /// \brief Instrument StoreInst
+  ///
+  /// Stores the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the store address is fully defined.
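+  ///
+  /// A sketch of the eventual lowering (assuming a pure and/xor mapping with
+  /// ShadowBase == 0; names are illustrative):
+  ///   store i32 %v, i32* %p
+  /// is followed by a store of %v's shadow into the shadow location for %p:
+  ///   %sp = inttoptr (xor (and (ptrtoint %p), ~AndMask), XorMask) to i32*
+  ///   store i32 %v_shadow, i32* %sp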
+ void visitStoreInst(StoreInst &I) { + StoreList.push_back(&I); + } + + void handleCASOrRMW(Instruction &I) { + assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I)); + + IRBuilder<> IRB(&I); + Value *Addr = I.getOperand(0); + Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB); + + if (ClCheckAccessAddress) + insertShadowCheck(Addr, &I); + + // Only test the conditional argument of cmpxchg instruction. + // The other argument can potentially be uninitialized, but we can not + // detect this situation reliably without possible false positives. + if (isa<AtomicCmpXchgInst>(I)) + insertShadowCheck(I.getOperand(1), &I); + + IRB.CreateStore(getCleanShadow(&I), ShadowPtr); + + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitAtomicRMWInst(AtomicRMWInst &I) { + handleCASOrRMW(I); + I.setOrdering(addReleaseOrdering(I.getOrdering())); + } + + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + handleCASOrRMW(I); + I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering())); + } + + // Vector manipulation. + void visitExtractElementInst(ExtractElementInst &I) { + insertShadowCheck(I.getOperand(1), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1), + "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitInsertElementInst(InsertElementInst &I) { + insertShadowCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + void visitShuffleVectorInst(ShuffleVectorInst &I) { + insertShadowCheck(I.getOperand(2), &I); + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1), + I.getOperand(2), "_msprop")); + setOriginForNaryOp(I); + } + + // Casts. + void visitSExtInst(SExtInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitZExtInst(ZExtInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitTruncInst(TruncInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitBitCastInst(BitCastInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I))); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitPtrToIntInst(PtrToIntInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false, + "_msprop_ptrtoint")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitIntToPtrInst(IntToPtrInst &I) { + IRBuilder<> IRB(&I); + setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false, + "_msprop_inttoptr")); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitFPToSIInst(CastInst& I) { handleShadowOr(I); } + void visitFPToUIInst(CastInst& I) { handleShadowOr(I); } + void visitSIToFPInst(CastInst& I) { handleShadowOr(I); } + void visitUIToFPInst(CastInst& I) { handleShadowOr(I); } + void visitFPExtInst(CastInst& I) { handleShadowOr(I); } + void visitFPTruncInst(CastInst& I) { handleShadowOr(I); } + + /// \brief Propagate shadow for bitwise AND. + /// + /// This code is exact, i.e. 
if, for example, a bit in the left argument
+  /// is defined and 0, then neither the value nor the definedness of the
+  /// corresponding bit in the other argument affects the resulting shadow.
+  void visitAnd(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // "And" of 0 and a poisoned value results in an unpoisoned value.
+    // 1&1 => 1;  0&1 => 0;  p&1 => p;
+    // 1&0 => 0;  0&0 => 0;  p&0 => 0;
+    // 1&p => p;  0&p => 0;  p&p => p;
+    // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  void visitOr(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // "Or" of 1 and a poisoned value results in an unpoisoned value.
+    // 1|1 => 1;  0|1 => 1;  p|1 => 1;
+    // 1|0 => 1;  0|0 => 0;  p|0 => p;
+    // 1|p => 1;  0|p => p;  p|p => p;
+    // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = IRB.CreateNot(I.getOperand(0));
+    Value *V2 = IRB.CreateNot(I.getOperand(1));
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Default propagation of shadow and/or origin.
+  ///
+  /// This class implements the general case of shadow propagation, used in all
+  /// cases where we don't know and/or don't care about what the operation
+  /// actually does. It converts all input shadow values to a common type
+  /// (extending or truncating as necessary), and bitwise ORs them.
+  ///
+  /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+  /// fully initialized), and less prone to false positives.
+  ///
+  /// This class also implements the general case of origin propagation. For a
+  /// Nary operation, result origin is set to the origin of an argument that is
+  /// not entirely initialized. If there is more than one such argument, the
+  /// rightmost of them is picked. It does not matter which one is picked if all
+  /// arguments are initialized.
+  template <bool CombineShadow>
+  class Combiner {
+    Value *Shadow;
+    Value *Origin;
+    IRBuilder<> &IRB;
+    MemorySanitizerVisitor *MSV;
+
+  public:
+    Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) :
+      Shadow(nullptr), Origin(nullptr), IRB(IRB), MSV(MSV) {}
+
+    /// \brief Add a pair of shadow and origin values to the mix.
+    Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+      if (CombineShadow) {
+        assert(OpShadow);
+        if (!Shadow)
+          Shadow = OpShadow;
+        else {
+          OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+          Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+        }
+      }
+
+      if (MSV->MS.TrackOrigins) {
+        assert(OpOrigin);
+        if (!Origin) {
+          Origin = OpOrigin;
+        } else {
+          Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+          // No point in adding something that might result in 0 origin value.
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) { + Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); + Value *Cond = + IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow)); + Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + } + } + } + return *this; + } + + /// \brief Add an application value to the mix. + Combiner &Add(Value *V) { + Value *OpShadow = MSV->getShadow(V); + Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr; + return Add(OpShadow, OpOrigin); + } + + /// \brief Set the current combined values as the given instruction's shadow + /// and origin. + void Done(Instruction *I) { + if (CombineShadow) { + assert(Shadow); + Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I)); + MSV->setShadow(I, Shadow); + } + if (MSV->MS.TrackOrigins) { + assert(Origin); + MSV->setOrigin(I, Origin); + } + } + }; + + typedef Combiner<true> ShadowAndOriginCombiner; + typedef Combiner<false> OriginCombiner; + + /// \brief Propagate origin for arbitrary operation. + void setOriginForNaryOp(Instruction &I) { + if (!MS.TrackOrigins) return; + IRBuilder<> IRB(&I); + OriginCombiner OC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + OC.Add(OI->get()); + OC.Done(&I); + } + + size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) { + assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) && + "Vector of pointers is not a valid shadow type"); + return Ty->isVectorTy() ? + Ty->getVectorNumElements() * Ty->getScalarSizeInBits() : + Ty->getPrimitiveSizeInBits(); + } + + /// \brief Cast between two shadow types, extending or truncating as + /// necessary. + Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy, + bool Signed = false) { + Type *srcTy = V->getType(); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) + return IRB.CreateIntCast(V, dstTy, Signed); + if (dstTy->isVectorTy() && srcTy->isVectorTy() && + dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) + return IRB.CreateIntCast(V, dstTy, Signed); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); + Value *V2 = + IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed); + return IRB.CreateBitCast(V2, dstTy); + // TODO: handle struct types. + } + + /// \brief Cast an application value to the type of its own shadow. + Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) { + Type *ShadowTy = getShadowTy(V); + if (V->getType() == ShadowTy) + return V; + if (V->getType()->isPtrOrPtrVectorTy()) + return IRB.CreatePtrToInt(V, ShadowTy); + else + return IRB.CreateBitCast(V, ShadowTy); + } + + /// \brief Propagate shadow for arbitrary operation. + void handleShadowOr(Instruction &I) { + IRBuilder<> IRB(&I); + ShadowAndOriginCombiner SC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + SC.Add(OI->get()); + SC.Done(&I); + } + + // \brief Handle multiplication by constant. + // + // Handle a special case of multiplication by constant that may have one or + // more zeros in the lower bits. This makes corresponding number of lower bits + // of the result zero as well. We model it by shifting the other operand + // shadow left by the required number of bits. Effectively, we transform + // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B). 
+ // We use multiplication by 2**N instead of shift to cover the case of + // multiplication by 0, which may occur in some elements of a vector operand. + void handleMulByConstant(BinaryOperator &I, Constant *ConstArg, + Value *OtherArg) { + Constant *ShadowMul; + Type *Ty = ConstArg->getType(); + if (Ty->isVectorTy()) { + unsigned NumElements = Ty->getVectorNumElements(); + Type *EltTy = Ty->getSequentialElementType(); + SmallVector<Constant *, 16> Elements; + for (unsigned Idx = 0; Idx < NumElements; ++Idx) { + ConstantInt *Elt = + dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx)); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + Elements.push_back(ConstantInt::get(EltTy, V2)); + } + ShadowMul = ConstantVector::get(Elements); + } else { + ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg); + APInt V = Elt->getValue(); + APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros(); + ShadowMul = ConstantInt::get(Elt->getType(), V2); + } + + IRBuilder<> IRB(&I); + setShadow(&I, + IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst")); + setOrigin(&I, getOrigin(OtherArg)); + } + + void visitMul(BinaryOperator &I) { + Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0)); + Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1)); + if (constOp0 && !constOp1) + handleMulByConstant(I, constOp0, I.getOperand(1)); + else if (constOp1 && !constOp0) + handleMulByConstant(I, constOp1, I.getOperand(0)); + else + handleShadowOr(I); + } + + void visitFAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitFSub(BinaryOperator &I) { handleShadowOr(I); } + void visitFMul(BinaryOperator &I) { handleShadowOr(I); } + void visitAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitSub(BinaryOperator &I) { handleShadowOr(I); } + void visitXor(BinaryOperator &I) { handleShadowOr(I); } + + void handleDiv(Instruction &I) { + IRBuilder<> IRB(&I); + // Strict on the second argument. + insertShadowCheck(I.getOperand(1), &I); + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + + void visitUDiv(BinaryOperator &I) { handleDiv(I); } + void visitSDiv(BinaryOperator &I) { handleDiv(I); } + void visitFDiv(BinaryOperator &I) { handleDiv(I); } + void visitURem(BinaryOperator &I) { handleDiv(I); } + void visitSRem(BinaryOperator &I) { handleDiv(I); } + void visitFRem(BinaryOperator &I) { handleDiv(I); } + + /// \brief Instrument == and != comparisons. + /// + /// Sometimes the comparison result is known even if some of the bits of the + /// arguments are not. + void handleEqualityComparison(ICmpInst &I) { + IRBuilder<> IRB(&I); + Value *A = I.getOperand(0); + Value *B = I.getOperand(1); + Value *Sa = getShadow(A); + Value *Sb = getShadow(B); + + // Get rid of pointers and vectors of pointers. + // For ints (and vectors of ints), types of A and Sa match, + // and this is a no-op. 
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // A == B  <==>  (C = A^B) == 0
+    // A != B  <==>  (C = A^B) != 0
+    // Sc = Sa | Sb
+    Value *C = IRB.CreateXor(A, B);
+    Value *Sc = IRB.CreateOr(Sa, Sb);
+    // Now we are dealing with the i = (C == 0) comparison (the C != 0 case is
+    // handled the same way). The result is defined if one of the following is
+    // true:
+    //   * there is a defined 1 bit in C, or
+    //   * C is fully defined.
+    // Si = !(C & ~Sc) && Sc
+    Value *Zero = Constant::getNullValue(Sc->getType());
+    Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+    Value *Si =
+      IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+                    IRB.CreateICmpEQ(
+                      IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+    Si->setName("_msprop_icmp");
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Build the lowest possible value of V, taking into account V's
+  /// uninitialized bits.
+  Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Maximize the undefined shadow bit, minimize other undefined bits.
+      return
+        IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+    } else {
+      // Minimize undefined bits.
+      return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+    }
+  }
+
+  /// \brief Build the highest possible value of V, taking into account V's
+  /// uninitialized bits.
+  Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                 bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Minimize the undefined shadow bit, maximize other undefined bits.
+      return
+        IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+    } else {
+      // Maximize undefined bits.
+      return IRB.CreateOr(A, Sa);
+    }
+  }
+
+  /// \brief Instrument relational comparisons.
+  ///
+  /// This function does exact shadow propagation for all relational
+  /// comparisons of integers, pointers and vectors of those.
+  /// FIXME: output seems suboptimal when one of the operands is a constant
+  void handleRelationalComparisonExact(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+
+    // Get rid of pointers and vectors of pointers.
+    // For ints (and vectors of ints), types of A and Sa match,
+    // and this is a no-op.
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // Let [a0, a1] be the interval of possible values of A, taking into account
+    // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+    // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
+    bool IsSigned = I.isSigned();
+    Value *S1 = IRB.CreateICmp(I.getPredicate(),
+                               getLowestPossibleValue(IRB, A, Sa, IsSigned),
+                               getHighestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *S2 = IRB.CreateICmp(I.getPredicate(),
+                               getHighestPossibleValue(IRB, A, Sa, IsSigned),
+                               getLowestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *Si = IRB.CreateXor(S1, S2);
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Instrument signed relational comparisons.
+  ///
+  /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by
+  /// propagating the highest bit of the shadow.
Everything else is delegated
+  /// to handleShadowOr().
+  void handleSignedRelationalComparison(ICmpInst &I) {
+    Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+    Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+    Value* op = nullptr;
+    CmpInst::Predicate pre = I.getPredicate();
+    if (constOp0 && constOp0->isNullValue() &&
+        (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) {
+      op = I.getOperand(1);
+    } else if (constOp1 && constOp1->isNullValue() &&
+               (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) {
+      op = I.getOperand(0);
+    }
+    if (op) {
+      IRBuilder<> IRB(&I);
+      Value* Shadow =
+        IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt");
+      setShadow(&I, Shadow);
+      setOrigin(&I, getOrigin(op));
+    } else {
+      handleShadowOr(I);
+    }
+  }
+
+  void visitICmpInst(ICmpInst &I) {
+    if (!ClHandleICmp) {
+      handleShadowOr(I);
+      return;
+    }
+    if (I.isEquality()) {
+      handleEqualityComparison(I);
+      return;
+    }
+
+    assert(I.isRelational());
+    if (ClHandleICmpExact) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+    if (I.isSigned()) {
+      handleSignedRelationalComparison(I);
+      return;
+    }
+
+    assert(I.isUnsigned());
+    if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+
+    handleShadowOr(I);
+  }
+
+  void visitFCmpInst(FCmpInst &I) {
+    handleShadowOr(I);
+  }
+
+  void handleShift(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+                                   S2->getType());
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShl(BinaryOperator &I) { handleShift(I); }
+  void visitAShr(BinaryOperator &I) { handleShift(I); }
+  void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+  /// \brief Instrument llvm.memmove
+  ///
+  /// At this point we don't know if llvm.memmove will be inlined or not.
+  /// If we don't instrument it and it gets inlined,
+  /// our interceptor will not kick in and we will lose the memmove.
+  /// If we instrument the call here, but it does not get inlined,
+  /// we will memmove the shadow twice, which is bad in the case
+  /// of overlapping regions. So, we simply lower the intrinsic to a call.
+  ///
+  /// A similar situation exists for memcpy and memset.
+  void visitMemMoveInst(MemMoveInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemmoveFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  // Similar to memmove: avoid copying shadow twice.
+  // This is somewhat unfortunate as it may slow down small constant memcpys.
+  // FIXME: consider doing manual inline for small constant sizes and proper
+  // alignment.
+  void visitMemCpyInst(MemCpyInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemcpyFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  // Same as memcpy.
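+  // Sketch of the lowering (the shadow itself is updated by the runtime
+  // interceptor, not by the pass):
+  //   call void @llvm.memset.p0i8.i64(i8* %p, i8 %v, i64 %n, i32 1, i1 false)
+  // becomes
+  //   call i8* @__msan_memset(i8* %p, i32 %v_ext, i64 %n)
+  // where %v_ext is %v zero-extended to i32 (names are illustrative).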
+  void visitMemSetInst(MemSetInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall(
+        MS.MemsetFn,
+        {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+    I.eraseFromParent();
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    VAHelper->visitVAStartInst(I);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    VAHelper->visitVACopyInst(I);
+  }
+
+  enum IntrinsicKind {
+    IK_DoesNotAccessMemory,
+    IK_OnlyReadsMemory,
+    IK_WritesMemory
+  };
+
+  static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) {
+    const int DoesNotAccessMemory = IK_DoesNotAccessMemory;
+    const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory;
+    const int OnlyReadsMemory = IK_OnlyReadsMemory;
+    const int OnlyAccessesArgumentPointees = IK_WritesMemory;
+    const int UnknownModRefBehavior = IK_WritesMemory;
+#define GET_INTRINSIC_MODREF_BEHAVIOR
+#define ModRefBehavior IntrinsicKind
+#include "llvm/IR/Intrinsics.gen"
+#undef ModRefBehavior
+#undef GET_INTRINSIC_MODREF_BEHAVIOR
+  }
+
+  /// \brief Handle vector store-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+  /// has 1 pointer argument and 1 vector argument, returns void.
+  bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value* Addr = I.getArgOperand(0);
+    Value *Shadow = getShadow(&I, 1);
+    Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+    // We don't know the pointer alignment (could be unaligned SSE store!).
+    // We have to assume the worst case.
+    IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    // FIXME: use ClStoreCleanOrigin
+    // FIXME: factor out common code from materializeStores
+    if (MS.TrackOrigins)
+      IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
+    return true;
+  }
+
+  /// \brief Handle vector load-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+  /// has 1 pointer argument, returns a vector.
+  bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getArgOperand(0);
+
+    Type *ShadowTy = getShadowTy(&I);
+    if (PropagateShadow) {
+      Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+      // We don't know the pointer alignment (could be unaligned SSE load!).
+      // We have to assume the worst case.
+      setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, 1, "_msld"));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    if (MS.TrackOrigins) {
+      if (PropagateShadow)
+        setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB, 1)));
+      else
+        setOrigin(&I, getCleanOrigin());
+    }
+    return true;
+  }
+
+  /// \brief Handle (SIMD arithmetic)-like intrinsics.
+  ///
+  /// Instrument intrinsics with any number of arguments of the same type,
+  /// equal to the return type. The type should be simple (no aggregates or
+  /// pointers; vectors are fine).
+  /// Caller guarantees that this intrinsic does not access memory.
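+  ///
+  /// For example (illustrative), a saturating add like
+  ///   <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>)
+  /// qualifies: all argument types equal the return type, so the result
+  /// shadow is simply the OR of the argument shadows.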
+  bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+    Type *RetTy = I.getType();
+    if (!(RetTy->isIntOrIntVectorTy() ||
+          RetTy->isFPOrFPVectorTy() ||
+          RetTy->isX86_MMXTy()))
+      return false;
+
+    unsigned NumArgOperands = I.getNumArgOperands();
+
+    for (unsigned i = 0; i < NumArgOperands; ++i) {
+      Type *Ty = I.getArgOperand(i)->getType();
+      if (Ty != RetTy)
+        return false;
+    }
+
+    IRBuilder<> IRB(&I);
+    ShadowAndOriginCombiner SC(this, IRB);
+    for (unsigned i = 0; i < NumArgOperands; ++i)
+      SC.Add(I.getArgOperand(i));
+    SC.Done(&I);
+
+    return true;
+  }
+
+  /// \brief Heuristically instrument unknown intrinsics.
+  ///
+  /// The main purpose of this code is to do something reasonable with all
+  /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
+  /// We recognize several classes of intrinsics by their argument types and
+  /// ModRefBehavior and apply special instrumentation when we are reasonably
+  /// sure that we know what the intrinsic does.
+  ///
+  /// We special-case intrinsics where this approach fails. See llvm.bswap
+  /// handling as an example of that.
+  bool handleUnknownIntrinsic(IntrinsicInst &I) {
+    unsigned NumArgOperands = I.getNumArgOperands();
+    if (NumArgOperands == 0)
+      return false;
+
+    Intrinsic::ID iid = I.getIntrinsicID();
+    IntrinsicKind IK = getIntrinsicKind(iid);
+    bool OnlyReadsMemory = IK == IK_OnlyReadsMemory;
+    bool WritesMemory = IK == IK_WritesMemory;
+    assert(!(OnlyReadsMemory && WritesMemory));
+
+    if (NumArgOperands == 2 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getArgOperand(1)->getType()->isVectorTy() &&
+        I.getType()->isVoidTy() &&
+        WritesMemory) {
+      // This looks like a vector store.
+      return handleVectorStoreIntrinsic(I);
+    }
+
+    if (NumArgOperands == 1 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getType()->isVectorTy() &&
+        OnlyReadsMemory) {
+      // This looks like a vector load.
+      return handleVectorLoadIntrinsic(I);
+    }
+
+    if (!OnlyReadsMemory && !WritesMemory)
+      if (maybeHandleSimpleNomemIntrinsic(I))
+        return true;
+
+    // FIXME: detect and handle SSE maskstore/maskload
+    return false;
+  }
+
+  void handleBswap(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Op = I.getArgOperand(0);
+    Type *OpType = Op->getType();
+    Function *BswapFunc = Intrinsic::getDeclaration(
+        F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
+    setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+    setOrigin(&I, getOrigin(Op));
+  }
+
+  // \brief Instrument vector convert intrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+  // number of \p Out elements, and (if it has 2 arguments) copies the rest of
+  // the elements from \p CopyOp.
+  // In most cases conversion involves a floating-point value which may trigger
+  // a hardware exception when not fully initialized. For this reason we
+  // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
+  // otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
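+  //
+  // Illustrative example: for
+  //   %Out = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %CopyOp,
+  //                                                  i32 %c)
+  // NumUsedElements is 1; the shadow of %c is checked (the conversion could
+  // otherwise operate on uninitialized data), and the result shadow is
+  // %CopyOp's shadow with element 0 zeroed out.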
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 3:
+      assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode");
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = nullptr;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine shadow for elements of ConvertOp that are used in this operation,
+    // and insert a check.
+    // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+    // int->any conversion.
+    Value *ConvertShadow = getShadow(ConvertOp);
+    Value *AggShadow = nullptr;
+    if (ConvertOp->getType()->isVectorTy()) {
+      AggShadow = IRB.CreateExtractElement(
+          ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+      for (int i = 1; i < NumUsedElements; ++i) {
+        Value *MoreShadow = IRB.CreateExtractElement(
+            ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+        AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+      }
+    } else {
+      AggShadow = ConvertShadow;
+    }
+    assert(AggShadow->getType()->isIntegerTy());
+    insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+    // Build result shadow by zero-filling parts of CopyOp shadow that come from
+    // ConvertOp.
+    if (CopyOp) {
+      assert(CopyOp->getType() == I.getType());
+      assert(CopyOp->getType()->isVectorTy());
+      Value *ResultShadow = getShadow(CopyOp);
+      Type *EltTy = ResultShadow->getType()->getVectorElementType();
+      for (int i = 0; i < NumUsedElements; ++i) {
+        ResultShadow = IRB.CreateInsertElement(
+            ResultShadow, ConstantInt::getNullValue(EltTy),
+            ConstantInt::get(IRB.getInt32Ty(), i));
+      }
+      setShadow(&I, ResultShadow);
+      setOrigin(&I, getOrigin(CopyOp));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
+    }
+  }
+
+  // Given a scalar or vector, extract lower 64 bits (or less), and return all
+  // zeroes if it is zero, and all ones otherwise.
+  Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+    if (S->getType()->isVectorTy())
+      S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
+    assert(S->getType()->getPrimitiveSizeInBits() <= 64);
+    Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+    return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+  }
+
+  Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
+    Type *T = S->getType();
+    assert(T->isVectorTy());
+    Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+    return IRB.CreateSExt(S2, T);
+  }
+
+  // \brief Instrument vector shift intrinsic.
+  //
+  // This function instruments intrinsics like int_x86_avx2_psll_w.
+  // Intrinsic shifts %In by %ShiftSize bits.
+  // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
+  // size, and the rest is ignored. Behavior is defined even if shift size is
+  // greater than register (or field) width.
+  void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
+    assert(I.getNumArgOperands() == 2);
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = Variable ?
VariableShadowExtend(IRB, S2)
+                             : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateCall(I.getCalledValue(),
+                                  {IRB.CreateBitCast(S1, V1->getType()), V2});
+    Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  // \brief Get an X86_MMX-sized vector type.
+  Type *getMMXVectorTy(unsigned EltSizeInBits) {
+    const unsigned X86_MMXSizeInBits = 64;
+    return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+                           X86_MMXSizeInBits / EltSizeInBits);
+  }
+
+  // \brief Returns a signed counterpart for an (un)signed-saturate-and-pack
+  // intrinsic.
+  Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+    switch (id) {
+      case llvm::Intrinsic::x86_sse2_packsswb_128:
+      case llvm::Intrinsic::x86_sse2_packuswb_128:
+        return llvm::Intrinsic::x86_sse2_packsswb_128;
+
+      case llvm::Intrinsic::x86_sse2_packssdw_128:
+      case llvm::Intrinsic::x86_sse41_packusdw:
+        return llvm::Intrinsic::x86_sse2_packssdw_128;
+
+      case llvm::Intrinsic::x86_avx2_packsswb:
+      case llvm::Intrinsic::x86_avx2_packuswb:
+        return llvm::Intrinsic::x86_avx2_packsswb;
+
+      case llvm::Intrinsic::x86_avx2_packssdw:
+      case llvm::Intrinsic::x86_avx2_packusdw:
+        return llvm::Intrinsic::x86_avx2_packssdw;
+
+      case llvm::Intrinsic::x86_mmx_packsswb:
+      case llvm::Intrinsic::x86_mmx_packuswb:
+        return llvm::Intrinsic::x86_mmx_packsswb;
+
+      case llvm::Intrinsic::x86_mmx_packssdw:
+        return llvm::Intrinsic::x86_mmx_packssdw;
+      default:
+        llvm_unreachable("unexpected intrinsic id");
+    }
+  }
+
+  // \brief Instrument vector pack intrinsic.
+  //
+  // This function instruments intrinsics like x86_mmx_packsswb, which pack
+  // elements of 2 input vectors into half as many bits with saturation.
+  // Shadow is propagated with the signed variant of the same intrinsic applied
+  // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+  // EltSizeInBits is used only for x86mmx arguments.
+  void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+    assert(I.getNumArgOperands() == 2);
+    bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+    IRBuilder<> IRB(&I);
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    assert(isX86_MMX || S1->getType()->isVectorTy());
+
+    // SExt and ICmpNE below must apply to individual elements of input vectors.
+    // In case of x86mmx arguments, cast them to appropriate vector types and
+    // back.
+    Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+    if (isX86_MMX) {
+      S1 = IRB.CreateBitCast(S1, T);
+      S2 = IRB.CreateBitCast(S2, T);
+    }
+    Value *S1_ext = IRB.CreateSExt(
+        IRB.CreateICmpNE(S1, llvm::Constant::getNullValue(T)), T);
+    Value *S2_ext = IRB.CreateSExt(
+        IRB.CreateICmpNE(S2, llvm::Constant::getNullValue(T)), T);
+    if (isX86_MMX) {
+      Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+      S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+      S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+    }
+
+    Function *ShadowFn = Intrinsic::getDeclaration(
+        F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+    Value *S =
+        IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+    if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+    setShadow(&I, S);
+    setOriginForNaryOp(I);
+  }
+
+  // \brief Instrument sum-of-absolute-differences intrinsic.
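+  //
+  // E.g. psadbw produces per-64-bit-lane sums of absolute byte differences,
+  // so only the low 16 bits of each result lane can be non-zero; as a sketch,
+  // the result shadow below is all-ones in those 16 bits iff any byte of the
+  // input shadows in that lane is non-zero, and zero in the upper 48 bits.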
+ void handleVectorSadIntrinsic(IntrinsicInst &I) { + const unsigned SignificantBitsPerResultElement = 16; + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType(); + unsigned ZeroBitsPerResultElement = + ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement; + + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateLShr(S, ZeroBitsPerResultElement); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + // \brief Instrument multiply-add intrinsic. + void handleVectorPmaddIntrinsic(IntrinsicInst &I, + unsigned EltSizeInBits = 0) { + bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy(); + Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType(); + IRBuilder<> IRB(&I); + Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1)); + S = IRB.CreateBitCast(S, ResTy); + S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), + ResTy); + S = IRB.CreateBitCast(S, getShadowTy(&I)); + setShadow(&I, S); + setOriginForNaryOp(I); + } + + void visitIntrinsicInst(IntrinsicInst &I) { + switch (I.getIntrinsicID()) { + case llvm::Intrinsic::bswap: + handleBswap(I); + break; + case llvm::Intrinsic::x86_avx512_cvtsd2usi64: + case llvm::Intrinsic::x86_avx512_cvtsd2usi: + case llvm::Intrinsic::x86_avx512_cvtss2usi64: + case llvm::Intrinsic::x86_avx512_cvtss2usi: + case llvm::Intrinsic::x86_avx512_cvttss2usi64: + case llvm::Intrinsic::x86_avx512_cvttss2usi: + case llvm::Intrinsic::x86_avx512_cvttsd2usi64: + case llvm::Intrinsic::x86_avx512_cvttsd2usi: + case llvm::Intrinsic::x86_avx512_cvtusi2sd: + case llvm::Intrinsic::x86_avx512_cvtusi2ss: + case llvm::Intrinsic::x86_avx512_cvtusi642sd: + case llvm::Intrinsic::x86_avx512_cvtusi642ss: + case llvm::Intrinsic::x86_sse2_cvtsd2si64: + case llvm::Intrinsic::x86_sse2_cvtsd2si: + case llvm::Intrinsic::x86_sse2_cvtsd2ss: + case llvm::Intrinsic::x86_sse2_cvtsi2sd: + case llvm::Intrinsic::x86_sse2_cvtsi642sd: + case llvm::Intrinsic::x86_sse2_cvtss2sd: + case llvm::Intrinsic::x86_sse2_cvttsd2si64: + case llvm::Intrinsic::x86_sse2_cvttsd2si: + case llvm::Intrinsic::x86_sse_cvtsi2ss: + case llvm::Intrinsic::x86_sse_cvtsi642ss: + case llvm::Intrinsic::x86_sse_cvtss2si64: + case llvm::Intrinsic::x86_sse_cvtss2si: + case llvm::Intrinsic::x86_sse_cvttss2si64: + case llvm::Intrinsic::x86_sse_cvttss2si: + handleVectorConvertIntrinsic(I, 1); + break; + case llvm::Intrinsic::x86_sse2_cvtdq2pd: + case llvm::Intrinsic::x86_sse2_cvtps2pd: + case llvm::Intrinsic::x86_sse_cvtps2pi: + case llvm::Intrinsic::x86_sse_cvttps2pi: + handleVectorConvertIntrinsic(I, 2); + break; + case llvm::Intrinsic::x86_avx2_psll_w: + case llvm::Intrinsic::x86_avx2_psll_d: + case llvm::Intrinsic::x86_avx2_psll_q: + case llvm::Intrinsic::x86_avx2_pslli_w: + case llvm::Intrinsic::x86_avx2_pslli_d: + case llvm::Intrinsic::x86_avx2_pslli_q: + case llvm::Intrinsic::x86_avx2_psrl_w: + case llvm::Intrinsic::x86_avx2_psrl_d: + case llvm::Intrinsic::x86_avx2_psrl_q: + case llvm::Intrinsic::x86_avx2_psra_w: + case llvm::Intrinsic::x86_avx2_psra_d: + case llvm::Intrinsic::x86_avx2_psrli_w: + case llvm::Intrinsic::x86_avx2_psrli_d: + case llvm::Intrinsic::x86_avx2_psrli_q: + case llvm::Intrinsic::x86_avx2_psrai_w: + case llvm::Intrinsic::x86_avx2_psrai_d: + case llvm::Intrinsic::x86_sse2_psll_w: + 
case llvm::Intrinsic::x86_sse2_psll_d: + case llvm::Intrinsic::x86_sse2_psll_q: + case llvm::Intrinsic::x86_sse2_pslli_w: + case llvm::Intrinsic::x86_sse2_pslli_d: + case llvm::Intrinsic::x86_sse2_pslli_q: + case llvm::Intrinsic::x86_sse2_psrl_w: + case llvm::Intrinsic::x86_sse2_psrl_d: + case llvm::Intrinsic::x86_sse2_psrl_q: + case llvm::Intrinsic::x86_sse2_psra_w: + case llvm::Intrinsic::x86_sse2_psra_d: + case llvm::Intrinsic::x86_sse2_psrli_w: + case llvm::Intrinsic::x86_sse2_psrli_d: + case llvm::Intrinsic::x86_sse2_psrli_q: + case llvm::Intrinsic::x86_sse2_psrai_w: + case llvm::Intrinsic::x86_sse2_psrai_d: + case llvm::Intrinsic::x86_mmx_psll_w: + case llvm::Intrinsic::x86_mmx_psll_d: + case llvm::Intrinsic::x86_mmx_psll_q: + case llvm::Intrinsic::x86_mmx_pslli_w: + case llvm::Intrinsic::x86_mmx_pslli_d: + case llvm::Intrinsic::x86_mmx_pslli_q: + case llvm::Intrinsic::x86_mmx_psrl_w: + case llvm::Intrinsic::x86_mmx_psrl_d: + case llvm::Intrinsic::x86_mmx_psrl_q: + case llvm::Intrinsic::x86_mmx_psra_w: + case llvm::Intrinsic::x86_mmx_psra_d: + case llvm::Intrinsic::x86_mmx_psrli_w: + case llvm::Intrinsic::x86_mmx_psrli_d: + case llvm::Intrinsic::x86_mmx_psrli_q: + case llvm::Intrinsic::x86_mmx_psrai_w: + case llvm::Intrinsic::x86_mmx_psrai_d: + handleVectorShiftIntrinsic(I, /* Variable */ false); + break; + case llvm::Intrinsic::x86_avx2_psllv_d: + case llvm::Intrinsic::x86_avx2_psllv_d_256: + case llvm::Intrinsic::x86_avx2_psllv_q: + case llvm::Intrinsic::x86_avx2_psllv_q_256: + case llvm::Intrinsic::x86_avx2_psrlv_d: + case llvm::Intrinsic::x86_avx2_psrlv_d_256: + case llvm::Intrinsic::x86_avx2_psrlv_q: + case llvm::Intrinsic::x86_avx2_psrlv_q_256: + case llvm::Intrinsic::x86_avx2_psrav_d: + case llvm::Intrinsic::x86_avx2_psrav_d_256: + handleVectorShiftIntrinsic(I, /* Variable */ true); + break; + + case llvm::Intrinsic::x86_sse2_packsswb_128: + case llvm::Intrinsic::x86_sse2_packssdw_128: + case llvm::Intrinsic::x86_sse2_packuswb_128: + case llvm::Intrinsic::x86_sse41_packusdw: + case llvm::Intrinsic::x86_avx2_packsswb: + case llvm::Intrinsic::x86_avx2_packssdw: + case llvm::Intrinsic::x86_avx2_packuswb: + case llvm::Intrinsic::x86_avx2_packusdw: + handleVectorPackIntrinsic(I); + break; + + case llvm::Intrinsic::x86_mmx_packsswb: + case llvm::Intrinsic::x86_mmx_packuswb: + handleVectorPackIntrinsic(I, 16); + break; + + case llvm::Intrinsic::x86_mmx_packssdw: + handleVectorPackIntrinsic(I, 32); + break; + + case llvm::Intrinsic::x86_mmx_psad_bw: + case llvm::Intrinsic::x86_sse2_psad_bw: + case llvm::Intrinsic::x86_avx2_psad_bw: + handleVectorSadIntrinsic(I); + break; + + case llvm::Intrinsic::x86_sse2_pmadd_wd: + case llvm::Intrinsic::x86_avx2_pmadd_wd: + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw_128: + case llvm::Intrinsic::x86_avx2_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I); + break; + + case llvm::Intrinsic::x86_ssse3_pmadd_ub_sw: + handleVectorPmaddIntrinsic(I, 8); + break; + + case llvm::Intrinsic::x86_mmx_pmadd_wd: + handleVectorPmaddIntrinsic(I, 16); + break; + + default: + if (!handleUnknownIntrinsic(I)) + visitInstruction(I); + break; + } + } + + void visitCallSite(CallSite CS) { + Instruction &I = *CS.getInstruction(); + assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite"); + if (CS.isCall()) { + CallInst *Call = cast<CallInst>(&I); + + // For inline asm, do the usual thing: check argument shadow and mark all + // outputs as clean. Note that any side effects of the inline asm that are + // not immediately visible in its constraints are not handled. 
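+      // Illustrative (assumed) example: for
+      //   call void asm "movq $1, $0", "=*m,r"(i64* %p, i64 %v)
+      // the operand shadows are checked, but the shadow of *%p is left
+      // unchanged, so a later read of *%p may see stale poison.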
+ if (Call->isInlineAsm()) { + visitInstruction(I); + return; + } + + assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere"); + + // We are going to insert code that relies on the fact that the callee + // will become a non-readonly function after it is instrumented by us. To + // prevent this code from being optimized out, mark that function + // non-readonly in advance. + if (Function *Func = Call->getCalledFunction()) { + // Clear out readonly/readnone attributes. + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + Func->removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(Func->getContext(), + AttributeSet::FunctionIndex, + B)); + } + } + IRBuilder<> IRB(&I); + + unsigned ArgOffset = 0; + DEBUG(dbgs() << " CallSite: " << I << "\n"); + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + unsigned i = ArgIt - CS.arg_begin(); + if (!A->getType()->isSized()) { + DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n"); + continue; + } + unsigned Size = 0; + Value *Store = nullptr; + // Compute the Shadow for arg even if it is ByVal, because + // in that case getShadow() will copy the actual arg shadow to + // __msan_param_tls. + Value *ArgShadow = getShadow(A); + Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset); + DEBUG(dbgs() << " Arg#" << i << ": " << *A << + " Shadow: " << *ArgShadow << "\n"); + bool ArgIsInitialized = false; + const DataLayout &DL = F.getParent()->getDataLayout(); + if (CS.paramHasAttr(i + 1, Attribute::ByVal)) { + assert(A->getType()->isPointerTy() && + "ByVal argument is not a pointer!"); + Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); + if (ArgOffset + Size > kParamTLSSize) break; + unsigned ParamAlignment = CS.getParamAlignment(i + 1); + unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); + Store = IRB.CreateMemCpy(ArgShadowBase, + getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB), + Size, Alignment); + } else { + Size = DL.getTypeAllocSize(A->getType()); + if (ArgOffset + Size > kParamTLSSize) break; + Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, + kShadowTLSAlignment); + Constant *Cst = dyn_cast<Constant>(ArgShadow); + if (Cst && Cst->isNullValue()) ArgIsInitialized = true; + } + if (MS.TrackOrigins && !ArgIsInitialized) + IRB.CreateStore(getOrigin(A), + getOriginPtrForArgument(A, IRB, ArgOffset)); + (void)Store; + assert(Size != 0 && Store != nullptr); + DEBUG(dbgs() << " Param:" << *Store << "\n"); + ArgOffset += RoundUpToAlignment(Size, 8); + } + DEBUG(dbgs() << " done with call args\n"); + + FunctionType *FT = + cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0)); + if (FT->isVarArg()) { + VAHelper->visitCallSite(CS, IRB); + } + + // Now, get the shadow for the RetVal. + if (!I.getType()->isSized()) return; + IRBuilder<> IRBBefore(&I); + // Until we have full dynamic coverage, make sure the retval shadow is 0. + Value *Base = getShadowPtrForRetval(&I, IRBBefore); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + Instruction *NextInsn = nullptr; + if (CS.isCall()) { + NextInsn = I.getNextNode(); + } else { + BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest(); + if (!NormalDest->getSinglePredecessor()) { + // FIXME: this case is tricky, so we are just conservative here. 
+        // Perhaps we need to split the edge between this BB and NormalDest,
+        // but a naive attempt to use SplitEdge leads to a crash.
+        setShadow(&I, getCleanShadow(&I));
+        setOrigin(&I, getCleanOrigin());
+        return;
+      }
+      NextInsn = NormalDest->getFirstInsertionPt();
+      assert(NextInsn &&
+             "Could not find insertion point for retval shadow load");
+    }
+    IRBuilder<> IRBAfter(NextInsn);
+    Value *RetvalShadow =
+      IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter),
+                                 kShadowTLSAlignment, "_msret");
+    setShadow(&I, RetvalShadow);
+    if (MS.TrackOrigins)
+      setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));
+  }
+
+  void visitReturnInst(ReturnInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *RetVal = I.getReturnValue();
+    if (!RetVal) return;
+    Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+    if (CheckReturnValue) {
+      insertShadowCheck(RetVal, &I);
+      Value *Shadow = getCleanShadow(RetVal);
+      IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+    } else {
+      Value *Shadow = getShadow(RetVal);
+      IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+      // FIXME: make it conditional if ClStoreCleanOrigin==0
+      if (MS.TrackOrigins)
+        IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+    }
+  }
+
+  void visitPHINode(PHINode &I) {
+    IRBuilder<> IRB(&I);
+    if (!PropagateShadow) {
+      setShadow(&I, getCleanShadow(&I));
+      setOrigin(&I, getCleanOrigin());
+      return;
+    }
+
+    ShadowPHINodes.push_back(&I);
+    setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+                                "_msphi_s"));
+    if (MS.TrackOrigins)
+      setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+                                  "_msphi_o"));
+  }
+
+  void visitAllocaInst(AllocaInst &I) {
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+    IRBuilder<> IRB(I.getNextNode());
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType());
+    if (PoisonStack && ClPoisonStackWithCall) {
+      IRB.CreateCall(MS.MsanPoisonStackFn,
+                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size)});
+    } else {
+      Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
+      Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
+      IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment());
+    }
+
+    if (PoisonStack && MS.TrackOrigins) {
+      SmallString<2048> StackDescriptionStorage;
+      raw_svector_ostream StackDescription(StackDescriptionStorage);
+      // We create a string with a description of the stack allocation and
+      // pass it into __msan_set_alloca_origin.
+      // It will be printed by the run-time if a stack-originated UMR is found.
+      // The first 4 bytes of the string are set to '----' and are overwritten
+      // with the stack origin id by the runtime on the first call.
+      StackDescription << "----" << I.getName() << "@" << F.getName();
+      Value *Descr =
+        createPrivateNonConstGlobalForString(*F.getParent(),
+                                             StackDescription.str());
+
+      IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
+                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size),
+                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+                      IRB.CreatePointerCast(&F, MS.IntptrTy)});
+    }
+  }
+
+  void visitSelectInst(SelectInst& I) {
+    IRBuilder<> IRB(&I);
+    // a = select b, c, d
+    Value *B = I.getCondition();
+    Value *C = I.getTrueValue();
+    Value *D = I.getFalseValue();
+    Value *Sb = getShadow(B);
+    Value *Sc = getShadow(C);
+    Value *Sd = getShadow(D);
+
+    // Result shadow if condition shadow is 0.
+ Value *Sa0 = IRB.CreateSelect(B, Sc, Sd); + Value *Sa1; + if (I.getType()->isAggregateType()) { + // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do + // an extra "select". This results in much more compact IR. + // Sa = select Sb, poisoned, (select b, Sc, Sd) + Sa1 = getPoisonedShadow(getShadowTy(I.getType())); + } else { + // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ] + // If Sb (condition is poisoned), look for bits in c and d that are equal + // and both unpoisoned. + // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd. + + // Cast arguments to shadow-compatible type. + C = CreateAppToShadowCast(IRB, C); + D = CreateAppToShadowCast(IRB, D); + + // Result shadow if condition shadow is 1. + Sa1 = IRB.CreateOr(IRB.CreateXor(C, D), IRB.CreateOr(Sc, Sd)); + } + Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select"); + setShadow(&I, Sa); + if (MS.TrackOrigins) { + // Origins are always i32, so any vector conditions must be flattened. + // FIXME: consider tracking vector origins for app vectors? + if (B->getType()->isVectorTy()) { + Type *FlatTy = getShadowTyNoVec(B->getType()); + B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy), + ConstantInt::getNullValue(FlatTy)); + Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy), + ConstantInt::getNullValue(FlatTy)); + } + // a = select b, c, d + // Oa = Sb ? Ob : (b ? Oc : Od) + setOrigin( + &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()), + IRB.CreateSelect(B, getOrigin(I.getTrueValue()), + getOrigin(I.getFalseValue())))); + } + } + + void visitLandingPadInst(LandingPadInst &I) { + // Do nothing. + // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1 + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } + + void visitGetElementPtrInst(GetElementPtrInst &I) { + handleShadowOr(I); + } + + void visitExtractValueInst(ExtractValueInst &I) { + IRBuilder<> IRB(&I); + Value *Agg = I.getAggregateOperand(); + DEBUG(dbgs() << "ExtractValue: " << I << "\n"); + Value *AggShadow = getShadow(Agg); + DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n"); + Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices()); + DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n"); + setShadow(&I, ResShadow); + setOriginForNaryOp(I); + } + + void visitInsertValueInst(InsertValueInst &I) { + IRBuilder<> IRB(&I); + DEBUG(dbgs() << "InsertValue: " << I << "\n"); + Value *AggShadow = getShadow(I.getAggregateOperand()); + Value *InsShadow = getShadow(I.getInsertedValueOperand()); + DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n"); + DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n"); + Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices()); + DEBUG(dbgs() << " Res: " << *Res << "\n"); + setShadow(&I, Res); + setOriginForNaryOp(I); + } + + void dumpInst(Instruction &I) { + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n"; + } else { + errs() << "ZZZ " << I.getOpcodeName() << "\n"; + } + errs() << "QQQ " << I << "\n"; + } + + void visitResumeInst(ResumeInst &I) { + DEBUG(dbgs() << "Resume: " << I << "\n"); + // Nothing to do here. + } + + void visitInstruction(Instruction &I) { + // Everything else: stop propagating and check for poisoned shadow. 
+ if (ClDumpStrictInstructions) + dumpInst(I); + DEBUG(dbgs() << "DEFAULT: " << I << "\n"); + for (size_t i = 0, n = I.getNumOperands(); i < n; i++) + insertShadowCheck(I.getOperand(i), &I); + setShadow(&I, getCleanShadow(&I)); + setOrigin(&I, getCleanOrigin()); + } +}; + +/// \brief AMD64-specific implementation of VarArgHelper. +struct VarArgAMD64Helper : public VarArgHelper { + // An unfortunate workaround for asymmetric lowering of va_arg stuff. + // See a comment in visitCallSite for more details. + static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 + static const unsigned AMD64FpEndOffset = 176; + + Function &F; + MemorySanitizer &MS; + MemorySanitizerVisitor &MSV; + Value *VAArgTLSCopy; + Value *VAArgOverflowSize; + + SmallVector<CallInst*, 16> VAStartInstrumentationList; + + VarArgAMD64Helper(Function &F, MemorySanitizer &MS, + MemorySanitizerVisitor &MSV) + : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr), + VAArgOverflowSize(nullptr) {} + + enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory }; + + ArgKind classifyArgument(Value* arg) { + // A very rough approximation of X86_64 argument classification rules. + Type *T = arg->getType(); + if (T->isFPOrFPVectorTy() || T->isX86_MMXTy()) + return AK_FloatingPoint; + if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64) + return AK_GeneralPurpose; + if (T->isPointerTy()) + return AK_GeneralPurpose; + return AK_Memory; + } + + // For VarArg functions, store the argument shadow in an ABI-specific format + // that corresponds to va_list layout. + // We do this because Clang lowers va_arg in the frontend, and this pass + // only sees the low level code that deals with va_list internals. + // A much easier alternative (provided that Clang emits va_arg instructions) + // would have been to associate each live instance of va_list with a copy of + // MSanParamTLS, and extract shadow on va_arg() call in the argument list + // order. + void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override { + unsigned GpOffset = 0; + unsigned FpOffset = AMD64GpEndOffset; + unsigned OverflowOffset = AMD64FpEndOffset; + const DataLayout &DL = F.getParent()->getDataLayout(); + for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end(); + ArgIt != End; ++ArgIt) { + Value *A = *ArgIt; + unsigned ArgNo = CS.getArgumentNo(ArgIt); + bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal); + if (IsByVal) { + // ByVal arguments always go to the overflow area. 
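+        // For example (a sketch): for a 24-byte struct passed byval, the
+        // callee receives a fresh copy of the whole object in the overflow
+        // area, so we copy all 24 bytes of its shadow there rather than
+        // storing a single register-sized shadow word.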
+ assert(A->getType()->isPointerTy()); + Type *RealTy = A->getType()->getPointerElementType(); + uint64_t ArgSize = DL.getTypeAllocSize(RealTy); + Value *Base = getShadowPtrForVAArgument(RealTy, IRB, OverflowOffset); + OverflowOffset += RoundUpToAlignment(ArgSize, 8); + IRB.CreateMemCpy(Base, MSV.getShadowPtr(A, IRB.getInt8Ty(), IRB), + ArgSize, kShadowTLSAlignment); + } else { + ArgKind AK = classifyArgument(A); + if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset) + AK = AK_Memory; + if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset) + AK = AK_Memory; + Value *Base; + switch (AK) { + case AK_GeneralPurpose: + Base = getShadowPtrForVAArgument(A->getType(), IRB, GpOffset); + GpOffset += 8; + break; + case AK_FloatingPoint: + Base = getShadowPtrForVAArgument(A->getType(), IRB, FpOffset); + FpOffset += 16; + break; + case AK_Memory: + uint64_t ArgSize = DL.getTypeAllocSize(A->getType()); + Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset); + OverflowOffset += RoundUpToAlignment(ArgSize, 8); + } + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + } + } + Constant *OverflowSize = + ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset); + IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS); + } + + /// \brief Compute the shadow address for a given va_arg. + Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, + int ArgOffset) { + Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); + Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); + return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0), + "_msarg"); + } + + void visitVAStartInst(VAStartInst &I) override { + IRBuilder<> IRB(&I); + VAStartInstrumentationList.push_back(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */24, /* alignment */8, false); + } + + void visitVACopyInst(VACopyInst &I) override { + IRBuilder<> IRB(&I); + Value *VAListTag = I.getArgOperand(0); + Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB); + + // Unpoison the whole __va_list_tag. + // FIXME: magic ABI constants. + IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), + /* size */24, /* alignment */8, false); + } + + void finalizeInstrumentation() override { + assert(!VAArgOverflowSize && !VAArgTLSCopy && + "finalizeInstrumentation called twice"); + if (!VAStartInstrumentationList.empty()) { + // If there is a va_start in this function, make a backup copy of + // va_arg_tls somewhere in the function entry block. + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS); + Value *CopySize = + IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), + VAArgOverflowSize); + VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8); + } + + // Instrument va_start. + // Copy va_list shadow from the backup copy of the TLS contents. 
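+    // For reference, the magic offsets used below follow the AMD64 ABI
+    // layout of __va_list_tag (restated here; not defined in this file):
+    //   struct __va_list_tag {
+    //     unsigned int gp_offset;   // byte offset 0
+    //     unsigned int fp_offset;   // byte offset 4
+    //     void *overflow_arg_area;  // byte offset 8
+    //     void *reg_save_area;      // byte offset 16
+    //   };
+    // so VAListTag+16 holds the register save area pointer and VAListTag+8
+    // the overflow area pointer.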
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      Value *RegSaveAreaPtrPtr =
+          IRB.CreateIntToPtr(
+              IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                            ConstantInt::get(MS.IntptrTy, 16)),
+              Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+          MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy,
+                       AMD64FpEndOffset, 16);
+
+      Value *OverflowArgAreaPtrPtr =
+          IRB.CreateIntToPtr(
+              IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                            ConstantInt::get(MS.IntptrTy, 8)),
+              Type::getInt64PtrTy(*MS.C));
+      Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
+      Value *OverflowArgAreaShadowPtr =
+          MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
+      Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+                                             AMD64FpEndOffset);
+      IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
+    }
+  }
+};
+
+/// \brief MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+                     MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(nullptr),
+      VAArgSize(nullptr) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+    unsigned VAArgOffset = 0;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin() + 1, End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      Value *Base;
+      uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+#if defined(__MIPSEB__) || defined(MIPSEB)
+      // Adjust the shadow offset for arguments smaller than 8 bytes to match
+      // the placement of bits on a big-endian system.
+      if (ArgSize < 8)
+        VAArgOffset += (8 - ArgSize);
+#endif
+      Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset);
+      VAArgOffset += ArgSize;
+      VAArgOffset = RoundUpToAlignment(VAArgOffset, 8);
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+
+    Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+    // We reuse VAArgOverflowSizeTLS as VAArgSizeTLS here (it holds the total
+    // size of all varargs) to avoid creating a new class member.
+    IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// \brief Compute the shadow address for a given va_arg.
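+  /// In effect this computes (a sketch of the arithmetic below, assuming
+  /// MS.VAArgTLS is the run-time's __msan_va_arg_tls shadow array):
+  ///   (ShadowTy *)((uintptr_t)__msan_va_arg_tls + ArgOffset)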
+  Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) override {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) override {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */8, /* alignment */8, false);
+  }
+
+  void finalizeInstrumentation() override {
+    assert(!VAArgSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+    VAArgSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+    Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+                                    VAArgSize);
+
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+      Value *RegSaveAreaPtrPtr =
+          IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                             Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+          MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy, CopySize, 8);
+    }
+  }
+};
+
+/// \brief A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+  VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+                   MemorySanitizerVisitor &MSV) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {}
+
+  void visitVAStartInst(VAStartInst &I) override {}
+
+  void visitVACopyInst(VACopyInst &I) override {}
+
+  void finalizeInstrumentation() override {}
+};
+
+VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                                 MemorySanitizerVisitor &Visitor) {
+  // VarArg handling is only implemented on AMD64 and MIPS64. False positives
+  // are possible on other platforms.
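+  // E.g. a target triple of "x86_64-unknown-linux-gnu" selects
+  // VarArgAMD64Helper, "mips64el-unknown-linux-gnu" selects
+  // VarArgMIPS64Helper, and anything else falls back to the no-op helper.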
+ llvm::Triple TargetTriple(Func.getParent()->getTargetTriple()); + if (TargetTriple.getArch() == llvm::Triple::x86_64) + return new VarArgAMD64Helper(Func, Msan, Visitor); + else if (TargetTriple.getArch() == llvm::Triple::mips64 || + TargetTriple.getArch() == llvm::Triple::mips64el) + return new VarArgMIPS64Helper(Func, Msan, Visitor); + else + return new VarArgNoOpHelper(Func, Msan, Visitor); +} + +} // namespace + +bool MemorySanitizer::runOnFunction(Function &F) { + if (&F == MsanCtorFunction) + return false; + MemorySanitizerVisitor Visitor(F, *this); + + // Clear out readonly/readnone attributes. + AttrBuilder B; + B.addAttribute(Attribute::ReadOnly) + .addAttribute(Attribute::ReadNone); + F.removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(F.getContext(), + AttributeSet::FunctionIndex, B)); + + return Visitor.runOnFunction(); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp new file mode 100644 index 0000000..6b185a2 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/SafeStack.cpp @@ -0,0 +1,609 @@ +//===-- SafeStack.cpp - Safe Stack Insertion ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass splits the stack into the safe stack (kept as-is for LLVM backend) +// and the unsafe stack (explicitly allocated and managed through the runtime +// support library). +// +// http://clang.llvm.org/docs/SafeStack.html +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "safestack" + +namespace llvm { + +STATISTIC(NumFunctions, "Total number of functions"); +STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack"); +STATISTIC(NumUnsafeStackRestorePointsFunctions, + "Number of functions that use setjmp or exceptions"); + +STATISTIC(NumAllocas, "Total number of allocas"); +STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas"); +STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas"); +STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads"); + +} // namespace llvm + +namespace { + +/// Check whether a given alloca instruction (AI) should be put on the safe +/// stack or not. The function analyzes all uses of AI and checks whether it is +/// only accessed in a memory safe way (as decided statically). 
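+/// For example (informal): an alloca whose address is only used by direct
+/// loads and stores stays on the safe stack, while one whose address is
+/// passed to a call without a nocapture guarantee, or indexed by a GEP with
+/// non-constant indices, is treated as unsafe and moved to the unsafe stack.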
+bool IsSafeStackAlloca(const AllocaInst *AI) { + // Go through all uses of this alloca and check whether all accesses to the + // allocated object are statically known to be memory safe and, hence, the + // object can be placed on the safe stack. + + SmallPtrSet<const Value *, 16> Visited; + SmallVector<const Instruction *, 8> WorkList; + WorkList.push_back(AI); + + // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc. + while (!WorkList.empty()) { + const Instruction *V = WorkList.pop_back_val(); + for (const Use &UI : V->uses()) { + auto I = cast<const Instruction>(UI.getUser()); + assert(V == UI.get()); + + switch (I->getOpcode()) { + case Instruction::Load: + // Loading from a pointer is safe. + break; + case Instruction::VAArg: + // "va-arg" from a pointer is safe. + break; + case Instruction::Store: + if (V == I->getOperand(0)) + // Stored the pointer - conservatively assume it may be unsafe. + return false; + // Storing to the pointee is safe. + break; + + case Instruction::GetElementPtr: + if (!cast<const GetElementPtrInst>(I)->hasAllConstantIndices()) + // GEP with non-constant indices can lead to memory errors. + // This also applies to inbounds GEPs, as the inbounds attribute + // represents an assumption that the address is in bounds, rather than + // an assertion that it is. + return false; + + // We assume that GEP on static alloca with constant indices is safe, + // otherwise a compiler would detect it and warn during compilation. + + if (!isa<const ConstantInt>(AI->getArraySize())) + // However, if the array size itself is not constant, the access + // might still be unsafe at runtime. + return false; + + /* fallthrough */ + + case Instruction::BitCast: + case Instruction::IntToPtr: + case Instruction::PHI: + case Instruction::PtrToInt: + case Instruction::Select: + // The object can be safe or not, depending on how the result of the + // instruction is used. + if (Visited.insert(I).second) + WorkList.push_back(cast<const Instruction>(I)); + break; + + case Instruction::Call: + case Instruction::Invoke: { + // FIXME: add support for memset and memcpy intrinsics. + ImmutableCallSite CS(I); + + // LLVM 'nocapture' attribute is only set for arguments whose address + // is not stored, passed around, or used in any other non-trivial way. + // We assume that passing a pointer to an object as a 'nocapture' + // argument is safe. + // FIXME: a more precise solution would require an interprocedural + // analysis here, which would look at all uses of an argument inside + // the function being called. + ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V && !CS.doesNotCapture(A - B)) + // The parameter is not marked 'nocapture' - unsafe. + return false; + continue; + } + + default: + // The object is unsafe if it is used in any other way. + return false; + } + } + } + + // All uses of the alloca are safe, we can place it on the safe stack. + return true; +} + +/// The SafeStack pass splits the stack of each function into the +/// safe stack, which is only accessed through memory safe dereferences +/// (as determined statically), and the unsafe stack, which contains all +/// local variables that are accessed in unsafe ways. +class SafeStack : public FunctionPass { + const DataLayout *DL; + + Type *StackPtrTy; + Type *IntPtrTy; + Type *Int32Ty; + Type *Int8Ty; + + Constant *UnsafeStackPtr = nullptr; + + /// Unsafe stack alignment. 
Each stack frame must ensure that the stack is + /// aligned to this value. We need to re-align the unsafe stack if the + /// alignment of any object on the stack exceeds this value. + /// + /// 16 seems like a reasonable upper bound on the alignment of objects that we + /// might expect to appear on the stack on most common targets. + enum { StackAlignment = 16 }; + + /// \brief Build a constant representing a pointer to the unsafe stack + /// pointer. + Constant *getOrCreateUnsafeStackPtr(Module &M); + + /// \brief Find all static allocas, dynamic allocas, return instructions and + /// stack restore points (exception unwind blocks and setjmp calls) in the + /// given function and append them to the respective vectors. + void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas, + SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<ReturnInst *> &Returns, + SmallVectorImpl<Instruction *> &StackRestorePoints); + + /// \brief Allocate space for all static allocas in \p StaticAllocas, + /// replace allocas with pointers into the unsafe stack and generate code to + /// restore the stack pointer before all return instructions in \p Returns. + /// + /// \returns A pointer to the top of the unsafe stack after all unsafe static + /// allocas are allocated. + Value *moveStaticAllocasToUnsafeStack(Function &F, + ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<ReturnInst *> Returns); + + /// \brief Generate code to restore the stack after all stack restore points + /// in \p StackRestorePoints. + /// + /// \returns A local variable in which to maintain the dynamic top of the + /// unsafe stack if needed. + AllocaInst * + createStackRestorePoints(Function &F, + ArrayRef<Instruction *> StackRestorePoints, + Value *StaticTop, bool NeedDynamicTop); + + /// \brief Replace all allocas in \p DynamicAllocas with code to allocate + /// space dynamically on the unsafe stack and store the dynamic unsafe stack + /// top to \p DynamicTop if non-null. + void moveDynamicAllocasToUnsafeStack(Function &F, Value *UnsafeStackPtr, + AllocaInst *DynamicTop, + ArrayRef<AllocaInst *> DynamicAllocas); + +public: + static char ID; // Pass identification, replacement for typeid. + SafeStack() : FunctionPass(ID), DL(nullptr) { + initializeSafeStackPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + } + + virtual bool doInitialization(Module &M) { + DL = &M.getDataLayout(); + + StackPtrTy = Type::getInt8PtrTy(M.getContext()); + IntPtrTy = DL->getIntPtrType(M.getContext()); + Int32Ty = Type::getInt32Ty(M.getContext()); + Int8Ty = Type::getInt8Ty(M.getContext()); + + return false; + } + + bool runOnFunction(Function &F); + +}; // class SafeStack + +Constant *SafeStack::getOrCreateUnsafeStackPtr(Module &M) { + // The unsafe stack pointer is stored in a global variable with a magic name. + const char *kUnsafeStackPtrVar = "__safestack_unsafe_stack_ptr"; + + auto UnsafeStackPtr = + dyn_cast_or_null<GlobalVariable>(M.getNamedValue(kUnsafeStackPtrVar)); + + if (!UnsafeStackPtr) { + // The global variable is not defined yet, define it ourselves. + // We use the initial-exec TLS model because we do not support the variable + // living anywhere other than in the main executable. 
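+    // For reference (a sketch, assuming the compiler-rt SafeStack runtime):
+    // the runtime is expected to define roughly
+    //   __thread void *__safestack_unsafe_stack_ptr;
+    // in the main executable, which is what makes initial-exec viable here.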
+ UnsafeStackPtr = new GlobalVariable( + /*Module=*/M, /*Type=*/StackPtrTy, + /*isConstant=*/false, /*Linkage=*/GlobalValue::ExternalLinkage, + /*Initializer=*/0, /*Name=*/kUnsafeStackPtrVar, + /*InsertBefore=*/nullptr, + /*ThreadLocalMode=*/GlobalValue::InitialExecTLSModel); + } else { + // The variable exists, check its type and attributes. + if (UnsafeStackPtr->getValueType() != StackPtrTy) { + report_fatal_error(Twine(kUnsafeStackPtrVar) + " must have void* type"); + } + + if (!UnsafeStackPtr->isThreadLocal()) { + report_fatal_error(Twine(kUnsafeStackPtrVar) + " must be thread-local"); + } + } + + return UnsafeStackPtr; +} + +void SafeStack::findInsts(Function &F, + SmallVectorImpl<AllocaInst *> &StaticAllocas, + SmallVectorImpl<AllocaInst *> &DynamicAllocas, + SmallVectorImpl<ReturnInst *> &Returns, + SmallVectorImpl<Instruction *> &StackRestorePoints) { + for (Instruction &I : inst_range(&F)) { + if (auto AI = dyn_cast<AllocaInst>(&I)) { + ++NumAllocas; + + if (IsSafeStackAlloca(AI)) + continue; + + if (AI->isStaticAlloca()) { + ++NumUnsafeStaticAllocas; + StaticAllocas.push_back(AI); + } else { + ++NumUnsafeDynamicAllocas; + DynamicAllocas.push_back(AI); + } + } else if (auto RI = dyn_cast<ReturnInst>(&I)) { + Returns.push_back(RI); + } else if (auto CI = dyn_cast<CallInst>(&I)) { + // setjmps require stack restore. + if (CI->getCalledFunction() && CI->canReturnTwice()) + StackRestorePoints.push_back(CI); + } else if (auto LP = dyn_cast<LandingPadInst>(&I)) { + // Exception landing pads require stack restore. + StackRestorePoints.push_back(LP); + } else if (auto II = dyn_cast<IntrinsicInst>(&I)) { + if (II->getIntrinsicID() == Intrinsic::gcroot) + llvm::report_fatal_error( + "gcroot intrinsic not compatible with safestack attribute"); + } + } +} + +AllocaInst * +SafeStack::createStackRestorePoints(Function &F, + ArrayRef<Instruction *> StackRestorePoints, + Value *StaticTop, bool NeedDynamicTop) { + if (StackRestorePoints.empty()) + return nullptr; + + IRBuilder<> IRB(StaticTop + ? cast<Instruction>(StaticTop)->getNextNode() + : (Instruction *)F.getEntryBlock().getFirstInsertionPt()); + + // We need the current value of the shadow stack pointer to restore + // after longjmp or exception catching. + + // FIXME: On some platforms this could be handled by the longjmp/exception + // runtime itself. + + AllocaInst *DynamicTop = nullptr; + if (NeedDynamicTop) + // If we also have dynamic alloca's, the stack pointer value changes + // throughout the function. For now we store it in an alloca. + DynamicTop = IRB.CreateAlloca(StackPtrTy, /*ArraySize=*/nullptr, + "unsafe_stack_dynamic_ptr"); + + if (!StaticTop) + // We need the original unsafe stack pointer value, even if there are + // no unsafe static allocas. + StaticTop = IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr"); + + if (NeedDynamicTop) + IRB.CreateStore(StaticTop, DynamicTop); + + // Restore current stack pointer after longjmp/exception catch. + for (Instruction *I : StackRestorePoints) { + ++NumUnsafeStackRestorePoints; + + IRB.SetInsertPoint(cast<Instruction>(I->getNextNode())); + Value *CurrentTop = DynamicTop ? 
IRB.CreateLoad(DynamicTop) : StaticTop; + IRB.CreateStore(CurrentTop, UnsafeStackPtr); + } + + return DynamicTop; +} + +Value * +SafeStack::moveStaticAllocasToUnsafeStack(Function &F, + ArrayRef<AllocaInst *> StaticAllocas, + ArrayRef<ReturnInst *> Returns) { + if (StaticAllocas.empty()) + return nullptr; + + IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt()); + DIBuilder DIB(*F.getParent()); + + // We explicitly compute and set the unsafe stack layout for all unsafe + // static alloca instructions. We save the unsafe "base pointer" in the + // prologue into a local variable and restore it in the epilogue. + + // Load the current stack pointer (we'll also use it as a base pointer). + // FIXME: use a dedicated register for it ? + Instruction *BasePointer = + IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr"); + assert(BasePointer->getType() == StackPtrTy); + + for (ReturnInst *RI : Returns) { + IRB.SetInsertPoint(RI); + IRB.CreateStore(BasePointer, UnsafeStackPtr); + } + + // Compute maximum alignment among static objects on the unsafe stack. + unsigned MaxAlignment = 0; + for (AllocaInst *AI : StaticAllocas) { + Type *Ty = AI->getAllocatedType(); + unsigned Align = + std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()); + if (Align > MaxAlignment) + MaxAlignment = Align; + } + + if (MaxAlignment > StackAlignment) { + // Re-align the base pointer according to the max requested alignment. + assert(isPowerOf2_32(MaxAlignment)); + IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + BasePointer = cast<Instruction>(IRB.CreateIntToPtr( + IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), + ConstantInt::get(IntPtrTy, ~uint64_t(MaxAlignment - 1))), + StackPtrTy)); + } + + // Allocate space for every unsafe static AllocaInst on the unsafe stack. + int64_t StaticOffset = 0; // Current stack top. + for (AllocaInst *AI : StaticAllocas) { + IRB.SetInsertPoint(AI); + + auto CArraySize = cast<ConstantInt>(AI->getArraySize()); + Type *Ty = AI->getAllocatedType(); + + uint64_t Size = DL->getTypeAllocSize(Ty) * CArraySize->getZExtValue(); + if (Size == 0) + Size = 1; // Don't create zero-sized stack objects. + + // Ensure the object is properly aligned. + unsigned Align = + std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()); + + // Add alignment. + // NOTE: we ensure that BasePointer itself is aligned to >= Align. + StaticOffset += Size; + StaticOffset = RoundUpToAlignment(StaticOffset, Align); + + Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8* + ConstantInt::get(Int32Ty, -StaticOffset)); + Value *NewAI = IRB.CreateBitCast(Off, AI->getType(), AI->getName()); + if (AI->hasName() && isa<Instruction>(NewAI)) + cast<Instruction>(NewAI)->takeName(AI); + + // Replace alloc with the new location. + replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + AI->replaceAllUsesWith(NewAI); + AI->eraseFromParent(); + } + + // Re-align BasePointer so that our callees would see it aligned as + // expected. + // FIXME: no need to update BasePointer in leaf functions. + StaticOffset = RoundUpToAlignment(StaticOffset, StackAlignment); + + // Update shadow stack pointer in the function epilogue. 
+ IRB.SetInsertPoint(cast<Instruction>(BasePointer->getNextNode())); + + Value *StaticTop = + IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -StaticOffset), + "unsafe_stack_static_top"); + IRB.CreateStore(StaticTop, UnsafeStackPtr); + return StaticTop; +} + +void SafeStack::moveDynamicAllocasToUnsafeStack( + Function &F, Value *UnsafeStackPtr, AllocaInst *DynamicTop, + ArrayRef<AllocaInst *> DynamicAllocas) { + DIBuilder DIB(*F.getParent()); + + for (AllocaInst *AI : DynamicAllocas) { + IRBuilder<> IRB(AI); + + // Compute the new SP value (after AI). + Value *ArraySize = AI->getArraySize(); + if (ArraySize->getType() != IntPtrTy) + ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false); + + Type *Ty = AI->getAllocatedType(); + uint64_t TySize = DL->getTypeAllocSize(Ty); + Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize)); + + Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy); + SP = IRB.CreateSub(SP, Size); + + // Align the SP value to satisfy the AllocaInst, type and stack alignments. + unsigned Align = std::max( + std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()), + (unsigned)StackAlignment); + + assert(isPowerOf2_32(Align)); + Value *NewTop = IRB.CreateIntToPtr( + IRB.CreateAnd(SP, ConstantInt::get(IntPtrTy, ~uint64_t(Align - 1))), + StackPtrTy); + + // Save the stack pointer. + IRB.CreateStore(NewTop, UnsafeStackPtr); + if (DynamicTop) + IRB.CreateStore(NewTop, DynamicTop); + + Value *NewAI = IRB.CreateIntToPtr(SP, AI->getType()); + if (AI->hasName() && isa<Instruction>(NewAI)) + NewAI->takeName(AI); + + replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true); + AI->replaceAllUsesWith(NewAI); + AI->eraseFromParent(); + } + + if (!DynamicAllocas.empty()) { + // Now go through the instructions again, replacing stacksave/stackrestore. + for (inst_iterator It = inst_begin(&F), Ie = inst_end(&F); It != Ie;) { + Instruction *I = &*(It++); + auto II = dyn_cast<IntrinsicInst>(I); + if (!II) + continue; + + if (II->getIntrinsicID() == Intrinsic::stacksave) { + IRBuilder<> IRB(II); + Instruction *LI = IRB.CreateLoad(UnsafeStackPtr); + LI->takeName(II); + II->replaceAllUsesWith(LI); + II->eraseFromParent(); + } else if (II->getIntrinsicID() == Intrinsic::stackrestore) { + IRBuilder<> IRB(II); + Instruction *SI = IRB.CreateStore(II->getArgOperand(0), UnsafeStackPtr); + SI->takeName(II); + assert(II->use_empty()); + II->eraseFromParent(); + } + } + } +} + +bool SafeStack::runOnFunction(Function &F) { + auto AA = &getAnalysis<AliasAnalysis>(); + + DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n"); + + if (!F.hasFnAttribute(Attribute::SafeStack)) { + DEBUG(dbgs() << "[SafeStack] safestack is not requested" + " for this function\n"); + return false; + } + + if (F.isDeclaration()) { + DEBUG(dbgs() << "[SafeStack] function definition" + " is not available\n"); + return false; + } + + { + // Make sure the regular stack protector won't run on this function + // (safestack attribute takes precedence). + AttrBuilder B; + B.addAttribute(Attribute::StackProtect) + .addAttribute(Attribute::StackProtectReq) + .addAttribute(Attribute::StackProtectStrong); + F.removeAttributes( + AttributeSet::FunctionIndex, + AttributeSet::get(F.getContext(), AttributeSet::FunctionIndex, B)); + } + + if (AA->onlyReadsMemory(&F)) { + // XXX: we don't protect against information leak attacks for now. 
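+    // (Rationale: a function that only reads memory cannot corrupt stack
+    // data, so instrumenting it buys no integrity protection; reads of stale
+    // unsafe-stack contents could still leak, hence the XXX above.)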
+ DEBUG(dbgs() << "[SafeStack] function only reads memory\n"); + return false; + } + + ++NumFunctions; + + SmallVector<AllocaInst *, 16> StaticAllocas; + SmallVector<AllocaInst *, 4> DynamicAllocas; + SmallVector<ReturnInst *, 4> Returns; + + // Collect all points where stack gets unwound and needs to be restored + // This is only necessary because the runtime (setjmp and unwind code) is + // not aware of the unsafe stack and won't unwind/restore it prorerly. + // To work around this problem without changing the runtime, we insert + // instrumentation to restore the unsafe stack pointer when necessary. + SmallVector<Instruction *, 4> StackRestorePoints; + + // Find all static and dynamic alloca instructions that must be moved to the + // unsafe stack, all return instructions and stack restore points. + findInsts(F, StaticAllocas, DynamicAllocas, Returns, StackRestorePoints); + + if (StaticAllocas.empty() && DynamicAllocas.empty() && + StackRestorePoints.empty()) + return false; // Nothing to do in this function. + + if (!StaticAllocas.empty() || !DynamicAllocas.empty()) + ++NumUnsafeStackFunctions; // This function has the unsafe stack. + + if (!StackRestorePoints.empty()) + ++NumUnsafeStackRestorePointsFunctions; + + if (!UnsafeStackPtr) + UnsafeStackPtr = getOrCreateUnsafeStackPtr(*F.getParent()); + + // The top of the unsafe stack after all unsafe static allocas are allocated. + Value *StaticTop = moveStaticAllocasToUnsafeStack(F, StaticAllocas, Returns); + + // Safe stack object that stores the current unsafe stack top. It is updated + // as unsafe dynamic (non-constant-sized) allocas are allocated and freed. + // This is only needed if we need to restore stack pointer after longjmp + // or exceptions, and we have dynamic allocations. + // FIXME: a better alternative might be to store the unsafe stack pointer + // before setjmp / invoke instructions. + AllocaInst *DynamicTop = createStackRestorePoints( + F, StackRestorePoints, StaticTop, !DynamicAllocas.empty()); + + // Handle dynamic allocas. + moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop, + DynamicAllocas); + + DEBUG(dbgs() << "[SafeStack] safestack applied\n"); + return true; +} + +} // end anonymous namespace + +char SafeStack::ID = 0; +INITIALIZE_PASS_BEGIN(SafeStack, "safe-stack", + "Safe Stack instrumentation pass", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(SafeStack, "safe-stack", "Safe Stack instrumentation pass", + false, false) + +FunctionPass *llvm::createSafeStackPass() { return new SafeStack(); } diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp new file mode 100644 index 0000000..7a5b4cb --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -0,0 +1,457 @@ +//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Coverage instrumentation that works with AddressSanitizer +// and potentially with other Sanitizers. 
+//
+// We create a Guard variable with the same linkage
+// as the function and inject this code into the entry block (SCK_Function)
+// or all blocks (SCK_BB):
+// if (Guard < 0) {
+//    __sanitizer_cov(&Guard);
+// }
+// The accesses to Guard are atomic. The rest of the logic is
+// in __sanitizer_cov (it's fine to call it more than once).
+//
+// With SCK_Edge we also split critical edges, effectively
+// instrumenting all edges.
+//
+// This coverage implementation provides very limited data:
+// it only tells whether a given function (block) was ever executed. No
+// counters. But for many use cases this is what we need, and the added
+// slowdown is small.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sancov"
+
+static const char *const kSanCovModuleInitName = "__sanitizer_cov_module_init";
+static const char *const kSanCovName = "__sanitizer_cov";
+static const char *const kSanCovWithCheckName = "__sanitizer_cov_with_check";
+static const char *const kSanCovIndirCallName = "__sanitizer_cov_indir_call16";
+static const char *const kSanCovTraceEnter = "__sanitizer_cov_trace_func_enter";
+static const char *const kSanCovTraceBB = "__sanitizer_cov_trace_basic_block";
+static const char *const kSanCovTraceCmp = "__sanitizer_cov_trace_cmp";
+static const char *const kSanCovModuleCtorName = "sancov.module_ctor";
+static const uint64_t kSanCtorAndDtorPriority = 2;
+
+static cl::opt<int> ClCoverageLevel("sanitizer-coverage-level",
+       cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+                "3: all blocks and critical edges, "
+                "4: above plus indirect calls"),
+       cl::Hidden, cl::init(0));
+
+static cl::opt<unsigned> ClCoverageBlockThreshold(
+    "sanitizer-coverage-block-threshold",
+    cl::desc("Use a callback with a guard check inside it if there are"
+             " more than this number of blocks."),
+    cl::Hidden, cl::init(500));
+
+static cl::opt<bool>
+    ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
+                          cl::desc("Experimental basic-block tracing: insert "
+                                   "callbacks at every basic block"),
+                          cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+    ClExperimentalCMPTracing("sanitizer-coverage-experimental-trace-compares",
+                             cl::desc("Experimental tracing of CMP and similar "
+                                      "instructions"),
+                             cl::Hidden, cl::init(false));
+
+// Experimental 8-bit counters used as an additional search heuristic during
+// coverage-guided fuzzing.
+// The counters are not thread-friendly:
+//   - contention on these counters may cause significant slowdown;
+//   - the counter updates are racy and the results may be inaccurate.
+// They are also inaccurate due to 8-bit integer overflow.
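+// A sketch of the per-block update injected when this is enabled (see
+// InjectCoverageAtBlock below): effectively Counter[Idx]++, done as a plain,
+// non-atomic 8-bit load/add/store.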
+static cl::opt<bool> ClUse8bitCounters("sanitizer-coverage-8bit-counters", + cl::desc("Experimental 8-bit counters"), + cl::Hidden, cl::init(false)); + +namespace { + +SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) { + SanitizerCoverageOptions Res; + switch (LegacyCoverageLevel) { + case 0: + Res.CoverageType = SanitizerCoverageOptions::SCK_None; + break; + case 1: + Res.CoverageType = SanitizerCoverageOptions::SCK_Function; + break; + case 2: + Res.CoverageType = SanitizerCoverageOptions::SCK_BB; + break; + case 3: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + break; + case 4: + Res.CoverageType = SanitizerCoverageOptions::SCK_Edge; + Res.IndirectCalls = true; + break; + } + return Res; +} + +SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { + // Sets CoverageType and IndirectCalls. + SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel); + Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType); + Options.IndirectCalls |= CLOpts.IndirectCalls; + Options.TraceBB |= ClExperimentalTracing; + Options.TraceCmp |= ClExperimentalCMPTracing; + Options.Use8bitCounters |= ClUse8bitCounters; + return Options; +} + +class SanitizerCoverageModule : public ModulePass { + public: + SanitizerCoverageModule( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(OverrideFromCL(Options)) {} + bool runOnModule(Module &M) override; + bool runOnFunction(Function &F); + static char ID; // Pass identification, replacement for typeid + const char *getPassName() const override { + return "SanitizerCoverageModule"; + } + + private: + void InjectCoverageForIndirectCalls(Function &F, + ArrayRef<Instruction *> IndirCalls); + void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); + bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks); + void SetNoSanitizeMetadata(Instruction *I); + void InjectCoverageAtBlock(Function &F, BasicBlock &BB, bool UseCalls); + unsigned NumberOfInstrumentedBlocks() { + return SanCovFunction->getNumUses() + SanCovWithCheckFunction->getNumUses(); + } + Function *SanCovFunction; + Function *SanCovWithCheckFunction; + Function *SanCovIndirCallFunction; + Function *SanCovTraceEnter, *SanCovTraceBB; + Function *SanCovTraceCmpFunction; + InlineAsm *EmptyAsm; + Type *IntptrTy, *Int64Ty; + LLVMContext *C; + const DataLayout *DL; + + GlobalVariable *GuardArray; + GlobalVariable *EightBitCounterArray; + + SanitizerCoverageOptions Options; +}; + +} // namespace + +bool SanitizerCoverageModule::runOnModule(Module &M) { + if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) + return false; + C = &(M.getContext()); + DL = &M.getDataLayout(); + IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits()); + Type *VoidTy = Type::getVoidTy(*C); + IRBuilder<> IRB(*C); + Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty()); + Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + Int64Ty = IRB.getInt64Ty(); + + SanCovFunction = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovName, VoidTy, Int32PtrTy, nullptr)); + SanCovWithCheckFunction = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovWithCheckName, VoidTy, Int32PtrTy, nullptr)); + SanCovIndirCallFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr)); + SanCovTraceCmpFunction = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + kSanCovTraceCmp, VoidTy, Int64Ty, 
Int64Ty, Int64Ty, nullptr)); + + // We insert an empty inline asm after cov callbacks to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + + if (Options.TraceBB) { + SanCovTraceEnter = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceEnter, VoidTy, Int32PtrTy, nullptr)); + SanCovTraceBB = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(kSanCovTraceBB, VoidTy, Int32PtrTy, nullptr)); + } + + // At this point we create a dummy array of guards because we don't + // know how many elements we will need. + Type *Int32Ty = IRB.getInt32Ty(); + Type *Int8Ty = IRB.getInt8Ty(); + + GuardArray = + new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, + nullptr, "__sancov_gen_cov_tmp"); + if (Options.Use8bitCounters) + EightBitCounterArray = + new GlobalVariable(M, Int8Ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__sancov_gen_cov_tmp"); + + for (auto &F : M) + runOnFunction(F); + + auto N = NumberOfInstrumentedBlocks(); + + // Now we know how many elements we need. Create an array of guards + // with one extra element at the beginning for the size. + Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1); + GlobalVariable *RealGuardArray = new GlobalVariable( + M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov"); + + + // Replace the dummy array with the real one. + GuardArray->replaceAllUsesWith( + IRB.CreatePointerCast(RealGuardArray, Int32PtrTy)); + GuardArray->eraseFromParent(); + + GlobalVariable *RealEightBitCounterArray; + if (Options.Use8bitCounters) { + // Make sure the array is 16-aligned. + static const int kCounterAlignment = 16; + Type *Int8ArrayNTy = + ArrayType::get(Int8Ty, RoundUpToAlignment(N, kCounterAlignment)); + RealEightBitCounterArray = new GlobalVariable( + M, Int8ArrayNTy, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Int8ArrayNTy), "__sancov_gen_cov_counter"); + RealEightBitCounterArray->setAlignment(kCounterAlignment); + EightBitCounterArray->replaceAllUsesWith( + IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)); + EightBitCounterArray->eraseFromParent(); + } + + // Create variable for module (compilation unit) name + Constant *ModNameStrConst = + ConstantDataArray::getString(M.getContext(), M.getName(), true); + GlobalVariable *ModuleName = + new GlobalVariable(M, ModNameStrConst->getType(), true, + GlobalValue::PrivateLinkage, ModNameStrConst); + + Function *CtorFunc; + std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions( + M, kSanCovModuleCtorName, kSanCovModuleInitName, + {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy}, + {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy), + ConstantInt::get(IntptrTy, N), + Options.Use8bitCounters + ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreatePointerCast(ModuleName, Int8PtrTy)}); + + appendToGlobalCtors(M, CtorFunc, kSanCtorAndDtorPriority); + + return true; +} + +bool SanitizerCoverageModule::runOnFunction(Function &F) { + if (F.empty()) return false; + if (F.getName().find(".module_ctor") != std::string::npos) + return false; // Should not instrument sanitizer init functions. 
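+  // (Why splitting helps: after SplitAllCriticalEdges each critical edge
+  // gets a block of its own, so the block-level instrumentation below then
+  // observes every edge, not just every block.)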
+ if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) + SplitAllCriticalEdges(F); + SmallVector<Instruction*, 8> IndirCalls; + SmallVector<BasicBlock*, 16> AllBlocks; + SmallVector<Instruction*, 8> CmpTraceTargets; + for (auto &BB : F) { + AllBlocks.push_back(&BB); + for (auto &Inst : BB) { + if (Options.IndirectCalls) { + CallSite CS(&Inst); + if (CS && !CS.getCalledFunction()) + IndirCalls.push_back(&Inst); + } + if (Options.TraceCmp && isa<ICmpInst>(&Inst)) + CmpTraceTargets.push_back(&Inst); + } + } + InjectCoverage(F, AllBlocks); + InjectCoverageForIndirectCalls(F, IndirCalls); + InjectTraceForCmp(F, CmpTraceTargets); + return true; +} + +bool SanitizerCoverageModule::InjectCoverage(Function &F, + ArrayRef<BasicBlock *> AllBlocks) { + switch (Options.CoverageType) { + case SanitizerCoverageOptions::SCK_None: + return false; + case SanitizerCoverageOptions::SCK_Function: + InjectCoverageAtBlock(F, F.getEntryBlock(), false); + return true; + default: { + bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size(); + for (auto BB : AllBlocks) + InjectCoverageAtBlock(F, *BB, UseCalls); + return true; + } + } +} + +// On every indirect call we call a run-time function +// __sanitizer_cov_indir_call* with two parameters: +// - callee address, +// - global cache array that contains kCacheSize pointers (zero-initialized). +// The cache is used to speed up recording the caller-callee pairs. +// The address of the caller is passed implicitly via caller PC. +// kCacheSize is encoded in the name of the run-time function. +void SanitizerCoverageModule::InjectCoverageForIndirectCalls( + Function &F, ArrayRef<Instruction *> IndirCalls) { + if (IndirCalls.empty()) return; + const int kCacheSize = 16; + const int kCacheAlignment = 64; // Align for better performance. + Type *Ty = ArrayType::get(IntptrTy, kCacheSize); + for (auto I : IndirCalls) { + IRBuilder<> IRB(I); + CallSite CS(I); + Value *Callee = CS.getCalledValue(); + if (isa<InlineAsm>(Callee)) continue; + GlobalVariable *CalleeCache = new GlobalVariable( + *F.getParent(), Ty, false, GlobalValue::PrivateLinkage, + Constant::getNullValue(Ty), "__sancov_gen_callee_cache"); + CalleeCache->setAlignment(kCacheAlignment); + IRB.CreateCall(SanCovIndirCallFunction, + {IRB.CreatePointerCast(Callee, IntptrTy), + IRB.CreatePointerCast(CalleeCache, IntptrTy)}); + } +} + +void SanitizerCoverageModule::InjectTraceForCmp( + Function &F, ArrayRef<Instruction *> CmpTraceTargets) { + for (auto I : CmpTraceTargets) { + if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) { + IRBuilder<> IRB(ICMP); + Value *A0 = ICMP->getOperand(0); + Value *A1 = ICMP->getOperand(1); + if (!A0->getType()->isIntegerTy()) continue; + uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType()); + // __sanitizer_cov_trace_cmp((type_size << 32) | predicate, A0, A1); + IRB.CreateCall( + SanCovTraceCmpFunction, + {ConstantInt::get(Int64Ty, (TypeSize << 32) | ICMP->getPredicate()), + IRB.CreateIntCast(A0, Int64Ty, true), + IRB.CreateIntCast(A1, Int64Ty, true)}); + } + } +} + +void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) { + I->setMetadata( + I->getParent()->getParent()->getParent()->getMDKindID("nosanitize"), + MDNode::get(*C, None)); +} + +void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, + bool UseCalls) { + // Don't insert coverage for unreachable blocks: we will never call + // __sanitizer_cov() for them, so counting them in + // NumberOfInstrumentedBlocks() might complicate calculation of code coverage + // percentage. 
Also, unreachable instructions frequently have no debug + // locations. + if (isa<UnreachableInst>(BB.getTerminator())) + return; + BasicBlock::iterator IP = BB.getFirstInsertionPt(), BE = BB.end(); + // Skip static allocas at the top of the entry block so they don't become + // dynamic when we split the block. If we used our optimized stack layout, + // then there will only be one alloca and it will come first. + for (; IP != BE; ++IP) { + AllocaInst *AI = dyn_cast<AllocaInst>(IP); + if (!AI || !AI->isStaticAlloca()) + break; + } + + bool IsEntryBB = &BB == &F.getEntryBlock(); + DebugLoc EntryLoc; + if (IsEntryBB) { + if (auto SP = getDISubprogram(&F)) + EntryLoc = DebugLoc::get(SP->getScopeLine(), 0, SP); + } else { + EntryLoc = IP->getDebugLoc(); + } + + IRBuilder<> IRB(IP); + IRB.SetCurrentDebugLocation(EntryLoc); + SmallVector<Value *, 1> Indices; + Value *GuardP = IRB.CreateAdd( + IRB.CreatePointerCast(GuardArray, IntptrTy), + ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4)); + Type *Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty()); + GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy); + if (UseCalls) { + IRB.CreateCall(SanCovWithCheckFunction, GuardP); + } else { + LoadInst *Load = IRB.CreateLoad(GuardP); + Load->setAtomic(Monotonic); + Load->setAlignment(4); + SetNoSanitizeMetadata(Load); + Value *Cmp = IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load); + Instruction *Ins = SplitBlockAndInsertIfThen( + Cmp, IP, false, MDBuilder(*C).createBranchWeights(1, 100000)); + IRB.SetInsertPoint(Ins); + IRB.SetCurrentDebugLocation(EntryLoc); + // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC. + IRB.CreateCall(SanCovFunction, GuardP); + IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge. + } + + if (Options.Use8bitCounters) { + IRB.SetInsertPoint(IP); + Value *P = IRB.CreateAdd( + IRB.CreatePointerCast(EightBitCounterArray, IntptrTy), + ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1)); + P = IRB.CreateIntToPtr(P, IRB.getInt8PtrTy()); + LoadInst *LI = IRB.CreateLoad(P); + Value *Inc = IRB.CreateAdd(LI, ConstantInt::get(IRB.getInt8Ty(), 1)); + StoreInst *SI = IRB.CreateStore(Inc, P); + SetNoSanitizeMetadata(LI); + SetNoSanitizeMetadata(SI); + } + + if (Options.TraceBB) { + // Experimental support for tracing. + // Insert a callback with the same guard variable as used for coverage. + IRB.SetInsertPoint(IP); + IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP); + } +} + +char SanitizerCoverageModule::ID = 0; +INITIALIZE_PASS(SanitizerCoverageModule, "sancov", + "SanitizerCoverage: TODO." + "ModulePass", false, false) +ModulePass *llvm::createSanitizerCoverageModulePass( + const SanitizerCoverageOptions &Options) { + return new SanitizerCoverageModule(Options); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp new file mode 100644 index 0000000..1a46bbb --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -0,0 +1,603 @@ +//===-- ThreadSanitizer.cpp - race detector -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer, a race detector. 
+//
+// The tool is under development; for details about previous versions see
+// http://code.google.com/p/data-race-test
+//
+// The instrumentation phase is quite simple:
+// - Insert calls to the run-time library before every memory access.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Insert calls at function entry/exit.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tsan"
+
+static cl::opt<bool> ClInstrumentMemoryAccesses(
+    "tsan-instrument-memory-accesses", cl::init(true),
+    cl::desc("Instrument memory accesses"), cl::Hidden);
+static cl::opt<bool> ClInstrumentFuncEntryExit(
+    "tsan-instrument-func-entry-exit", cl::init(true),
+    cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool> ClInstrumentAtomics(
+    "tsan-instrument-atomics", cl::init(true),
+    cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+    "tsan-instrument-memintrinsics", cl::init(true),
+    cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+          "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+          "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
+
+static const char *const kTsanModuleCtorName = "tsan.module_ctor";
+static const char *const kTsanInitName = "__tsan_init";
+
+namespace {
+
+/// ThreadSanitizer: instrument the code in the module to find races.
+struct ThreadSanitizer : public FunctionPass {
+  ThreadSanitizer() : FunctionPass(ID) {}
+  const char *getPassName() const override;
+  bool runOnFunction(Function &F) override;
+  bool doInitialization(Module &M) override;
+  static char ID;  // Pass identification, replacement for typeid.
+ + private: + void initializeCallbacks(Module &M); + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local, + SmallVectorImpl<Instruction *> &All, + const DataLayout &DL); + bool addrPointsToConstantData(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL); + + Type *IntptrTy; + IntegerType *OrdTy; + // Callbacks to run-time library are computed in doInitialization. + Function *TsanFuncEntry; + Function *TsanFuncExit; + // Accesses sizes are powers of two: 1, 2, 4, 8, 16. + static const size_t kNumberOfAccessSizes = 5; + Function *TsanRead[kNumberOfAccessSizes]; + Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanUnalignedRead[kNumberOfAccessSizes]; + Function *TsanUnalignedWrite[kNumberOfAccessSizes]; + Function *TsanAtomicLoad[kNumberOfAccessSizes]; + Function *TsanAtomicStore[kNumberOfAccessSizes]; + Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes]; + Function *TsanAtomicCAS[kNumberOfAccessSizes]; + Function *TsanAtomicThreadFence; + Function *TsanAtomicSignalFence; + Function *TsanVptrUpdate; + Function *TsanVptrLoad; + Function *MemmoveFn, *MemcpyFn, *MemsetFn; + Function *TsanCtorFunction; +}; +} // namespace + +char ThreadSanitizer::ID = 0; +INITIALIZE_PASS(ThreadSanitizer, "tsan", + "ThreadSanitizer: detects data races.", + false, false) + +const char *ThreadSanitizer::getPassName() const { + return "ThreadSanitizer"; +} + +FunctionPass *llvm::createThreadSanitizerPass() { + return new ThreadSanitizer(); +} + +void ThreadSanitizer::initializeCallbacks(Module &M) { + IRBuilder<> IRB(M.getContext()); + // Initialize the callbacks. 
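+  // For the access-size loop below this declares, e.g. (a sketch of the
+  // expected run-time interface; see compiler-rt for the real prototypes):
+  //   void __tsan_read4(void *addr);
+  //   void __tsan_write4(void *addr);
+  //   u32 __tsan_atomic32_load(const volatile u32 *a, int mo);
+  //   u32 __tsan_atomic32_fetch_add(volatile u32 *a, u32 v, int mo);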
+ TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + TsanFuncExit = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), nullptr)); + OrdTy = IRB.getInt32Ty(); + for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { + const size_t ByteSize = 1 << i; + const size_t BitSize = ByteSize * 8; + SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + SmallString<64> UnalignedReadName("__tsan_unaligned_read" + + itostr(ByteSize)); + TsanUnalignedRead[i] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + UnalignedReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + + itostr(ByteSize)); + TsanUnalignedWrite[i] = + checkSanitizerInterfaceFunction(M.getOrInsertFunction( + UnalignedWriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + + "_load"); + TsanAtomicLoad[i] = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(AtomicLoadName, Ty, PtrTy, OrdTy, nullptr)); + + SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + + "_store"); + TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr)); + + for (int op = AtomicRMWInst::FIRST_BINOP; + op <= AtomicRMWInst::LAST_BINOP; ++op) { + TsanAtomicRMW[op][i] = nullptr; + const char *NamePart = nullptr; + if (op == AtomicRMWInst::Xchg) + NamePart = "_exchange"; + else if (op == AtomicRMWInst::Add) + NamePart = "_fetch_add"; + else if (op == AtomicRMWInst::Sub) + NamePart = "_fetch_sub"; + else if (op == AtomicRMWInst::And) + NamePart = "_fetch_and"; + else if (op == AtomicRMWInst::Or) + NamePart = "_fetch_or"; + else if (op == AtomicRMWInst::Xor) + NamePart = "_fetch_xor"; + else if (op == AtomicRMWInst::Nand) + NamePart = "_fetch_nand"; + else + continue; + SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart); + TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction( + M.getOrInsertFunction(RMWName, Ty, PtrTy, Ty, OrdTy, nullptr)); + } + + SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) + + "_compare_exchange_val"); + TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr)); + } + TsanVptrUpdate = checkSanitizerInterfaceFunction( + M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), + IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr)); + TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr)); + TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, nullptr)); + TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction( + "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, nullptr)); + + MemmoveFn = checkSanitizerInterfaceFunction( + 
+      M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+  MemcpyFn = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+  MemsetFn = checkSanitizerInterfaceFunction(
+      M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+                            IRB.getInt32Ty(), IntptrTy, nullptr));
+}
+
+bool ThreadSanitizer::doInitialization(Module &M) {
+  const DataLayout &DL = M.getDataLayout();
+  IntptrTy = DL.getIntPtrType(M.getContext());
+  std::tie(TsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+      M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
+      /*InitArgs=*/{});
+
+  appendToGlobalCtors(M, TsanCtorFunction, 0);
+
+  return true;
+}
+
+static bool isVtableAccess(Instruction *I) {
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
+  return false;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+  // If this is a GEP, just analyze its pointer operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+    Addr = GEP->getPointerOperand();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->isConstant()) {
+      // Reads from constant globals cannot race with any writes.
+      NumOmittedReadsFromConstantGlobals++;
+      return true;
+    }
+  } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+    if (isVtableAccess(L)) {
+      // Reads from a vtable pointer cannot race with any writes.
+      NumOmittedReadsFromVtable++;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+//  - read-before-write (within same BB, no calls between)
+//  - not captured variables
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
+void ThreadSanitizer::chooseInstructionsToInstrument(
+    SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+    const DataLayout &DL) {
+  SmallSet<Value*, 8> WriteTargets;
+  // Iterate from the end.
+  for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(),
+       E = Local.rend(); It != E; ++It) {
+    Instruction *I = *It;
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      WriteTargets.insert(Store->getPointerOperand());
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      if (WriteTargets.count(Addr)) {
+        // We will write to this temp, so no reason to analyze the read.
+        NumOmittedReadsBeforeWrite++;
+        continue;
+      }
+      if (addrPointsToConstantData(Addr)) {
+        // Addr points to some constant data -- it cannot race with any writes.
+        continue;
+      }
+    }
+    Value *Addr = isa<StoreInst>(*I)
+        ? cast<StoreInst>(I)->getPointerOperand()
+        : cast<LoadInst>(I)->getPointerOperand();
+    if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
+        !PointerMayBeCaptured(Addr, true, true)) {
+      // The variable is addressable but not captured, so it cannot be
+      // referenced from a different thread and participate in a data race
+      // (see llvm/Analysis/CaptureTracking.h for details).
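+      // Illustrative example (sink_by_value is a hypothetical callee that
+      // takes its argument by value): in
+      //   void f() { int x; x = 1; sink_by_value(x); }   // &x never escapes
+      // the store to 'x' is dropped here, whereas storing &x into a global
+      // or passing it to an unknown callee makes it captured, and the
+      // access stays instrumented.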
+ NumOmittedNonCaptured++; + continue; + } + All.push_back(I); + } + Local.clear(); +} + +static bool isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isAtomic() && LI->getSynchScope() == CrossThread; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isAtomic() && SI->getSynchScope() == CrossThread; + if (isa<AtomicRMWInst>(I)) + return true; + if (isa<AtomicCmpXchgInst>(I)) + return true; + if (isa<FenceInst>(I)) + return true; + return false; +} + +bool ThreadSanitizer::runOnFunction(Function &F) { + // This is required to prevent instrumenting call to __tsan_init from within + // the module constructor. + if (&F == TsanCtorFunction) + return false; + initializeCallbacks(*F.getParent()); + SmallVector<Instruction*, 8> RetVec; + SmallVector<Instruction*, 8> AllLoadsAndStores; + SmallVector<Instruction*, 8> LocalLoadsAndStores; + SmallVector<Instruction*, 8> AtomicAccesses; + SmallVector<Instruction*, 8> MemIntrinCalls; + bool Res = false; + bool HasCalls = false; + bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread); + const DataLayout &DL = F.getParent()->getDataLayout(); + + // Traverse all instructions, collect loads/stores/returns, check for calls. + for (auto &BB : F) { + for (auto &Inst : BB) { + if (isAtomic(&Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa<ReturnInst>(Inst)) + RetVec.push_back(&Inst); + else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) { + if (isa<MemIntrinsic>(Inst)) + MemIntrinCalls.push_back(&Inst); + HasCalls = true; + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + DL); + } + } + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL); + } + + // We have collected all loads and stores. + // FIXME: many of these accesses do not need to be checked for races + // (e.g. variables that do not escape, etc). + + // Instrument memory accesses only if we want to report bugs in the function. + if (ClInstrumentMemoryAccesses && SanitizeFunction) + for (auto Inst : AllLoadsAndStores) { + Res |= instrumentLoadOrStore(Inst, DL); + } + + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). + if (ClInstrumentAtomics) + for (auto Inst : AtomicAccesses) { + Res |= instrumentAtomic(Inst, DL); + } + + if (ClInstrumentMemIntrinsics && SanitizeFunction) + for (auto Inst : MemIntrinCalls) { + Res |= instrumentMemIntrinsic(Inst); + } + + // Instrument function entry/exit points if there were instrumented accesses. + if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { + IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); + Value *ReturnAddress = IRB.CreateCall( + Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); + IRB.CreateCall(TsanFuncEntry, ReturnAddress); + for (auto RetInst : RetVec) { + IRBuilder<> IRBRet(RetInst); + IRBRet.CreateCall(TsanFuncExit, {}); + } + Res = true; + } + return Res; +} + +bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa<StoreInst>(*I); + Value *Addr = IsWrite + ? 
+                  cast<StoreInst>(I)->getPointerOperand()
+                : cast<LoadInst>(I)->getPointerOperand();
+  int Idx = getMemoryAccessFuncIndex(Addr, DL);
+  if (Idx < 0)
+    return false;
+  if (IsWrite && isVtableAccess(I)) {
+    DEBUG(dbgs() << " VPTR : " << *I << "\n");
+    Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
+    // StoredValue may be a vector type if we are storing several vptrs at once.
+    // In this case, just take the first element of the vector since this is
+    // enough to find vptr races.
+    if (isa<VectorType>(StoredValue->getType()))
+      StoredValue = IRB.CreateExtractElement(
+          StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
+    if (StoredValue->getType()->isIntegerTy())
+      StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+    // Call TsanVptrUpdate.
+    IRB.CreateCall(TsanVptrUpdate,
+                   {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+                    IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
+    NumInstrumentedVtableWrites++;
+    return true;
+  }
+  if (!IsWrite && isVtableAccess(I)) {
+    IRB.CreateCall(TsanVptrLoad,
+                   IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+    NumInstrumentedVtableReads++;
+    return true;
+  }
+  const unsigned Alignment = IsWrite
+      ? cast<StoreInst>(I)->getAlignment()
+      : cast<LoadInst>(I)->getAlignment();
+  Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+  const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+  Value *OnAccessFunc = nullptr;
+  if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0)
+    OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+  else
+    OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+  IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+  if (IsWrite) NumInstrumentedWrites++;
+  else NumInstrumentedReads++;
+  return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+  uint32_t v = 0;
+  switch (ord) {
+    case NotAtomic: llvm_unreachable("unexpected atomic ordering!");
+    case Unordered: // Fall-through.
+    case Monotonic: v = 0; break;
+    // case Consume: v = 1; break; // Not specified yet.
+    case Acquire: v = 2; break;
+    case Release: v = 3; break;
+    case AcquireRelease: v = 4; break;
+    case SequentiallyConsistent: v = 5; break;
+  }
+  return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+    IRB.CreateCall(
+        MemsetFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    I->eraseFromParent();
+  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+    IRB.CreateCall(
+        isa<MemCpyInst>(M) ?
+            MemcpyFn : MemmoveFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    I->eraseFromParent();
+  }
+  return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards. For background see the C++11 standard. A slightly older, publicly
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+  IRBuilder<> IRB(I);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    Value *Addr = LI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr, DL);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     createOrdering(&IRB, LI->getOrdering())};
+    CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], Args);
+    ReplaceInstWithInst(I, C);
+
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    Value *Addr = SI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr, DL);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(SI->getValueOperand(), Ty, false),
+                     createOrdering(&IRB, SI->getOrdering())};
+    CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
+    ReplaceInstWithInst(I, C);
+  } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+    Value *Addr = RMWI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr, DL);
+    if (Idx < 0)
+      return false;
+    Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+    if (!F)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+                     createOrdering(&IRB, RMWI->getOrdering())};
+    CallInst *C = CallInst::Create(F, Args);
+    ReplaceInstWithInst(I, C);
+  } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+    Value *Addr = CASI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr, DL);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
+                     IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+                     createOrdering(&IRB, CASI->getSuccessOrdering()),
+                     createOrdering(&IRB, CASI->getFailureOrdering())};
+    CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
+    Value *Success = IRB.CreateICmpEQ(C, CASI->getCompareOperand());
+
+    Value *Res = IRB.CreateInsertValue(UndefValue::get(CASI->getType()), C, 0);
+    Res = IRB.CreateInsertValue(Res, Success, 1);
+
+    I->replaceAllUsesWith(Res);
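+    // Net effect (illustrative sketch): a seq_cst cmpxchg such as
+    //   %pair = cmpxchg i32* %p, i32 %exp, i32 %new seq_cst seq_cst
+    // becomes, roughly,
+    //   %old = call i32 @__tsan_atomic32_compare_exchange_val(
+    //                    i32* %p, i32 %exp, i32 %new, i32 5, i32 5)
+    //   %ok  = icmp eq i32 %old, %exp
+    // with { %old, %ok } rebuilt via insertvalue to keep the original
+    // { i32, i1 } result type of cmpxchg.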
+    I->eraseFromParent();
+  } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+    Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+    Function *F = FI->getSynchScope() == SingleThread ?
+        TsanAtomicSignalFence : TsanAtomicThreadFence;
+    CallInst *C = CallInst::Create(F, Args);
+    ReplaceInstWithInst(I, C);
+  }
+  return true;
+}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
+                                              const DataLayout &DL) {
+  Type *OrigPtrTy = Addr->getType();
+  Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+  assert(OrigTy->isSized());
+  uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+  if (TypeSize != 8 && TypeSize != 16 &&
+      TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+    NumAccessesWithBadSize++;
+    // Ignore all unusual sizes.
+    return -1;
+  }
+  size_t Idx = countTrailingZeros(TypeSize / 8);
+  assert(Idx < kNumberOfAccessSizes);
+  return Idx;
+}
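+
+// Worked example (illustrative): an i32 access has TypeSize == 32, so
+// Idx == countTrailingZeros(32 / 8) == 2 and the size-4 hooks
+// (__tsan_read4/__tsan_write4) are used; an i64 access maps to Idx == 3;
+// an unusual size such as a 24-bit access matches none of the allowed
+// sizes, returns -1, and the access is skipped (NumAccessesWithBadSize).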